LLVM  4.0.0
SLPVectorizer.cpp
Go to the documentation of this file.
1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10 // stores that can be put together into vector-stores. Next, it attempts to
11 // construct vectorizable tree using the use-def chains. If a profitable tree
12 // was found, the SLP vectorizer performs vectorization on the tree.
13 //
14 // The pass is inspired by the work described in the paper:
15 // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16 //
17 //===----------------------------------------------------------------------===//
19 #include "llvm/ADT/Optional.h"
21 #include "llvm/ADT/SetVector.h"
22 #include "llvm/ADT/Statistic.h"
29 #include "llvm/IR/DataLayout.h"
30 #include "llvm/IR/Dominators.h"
31 #include "llvm/IR/IRBuilder.h"
32 #include "llvm/IR/Instructions.h"
33 #include "llvm/IR/IntrinsicInst.h"
34 #include "llvm/IR/Module.h"
35 #include "llvm/IR/NoFolder.h"
36 #include "llvm/IR/Type.h"
37 #include "llvm/IR/Value.h"
38 #include "llvm/IR/Verifier.h"
39 #include "llvm/Pass.h"
41 #include "llvm/Support/Debug.h"
44 #include <algorithm>
45 #include <memory>
46 
47 using namespace llvm;
48 using namespace slpvectorizer;
49 
50 #define SV_NAME "slp-vectorizer"
51 #define DEBUG_TYPE "SLP"
52 
53 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
54 
55 static cl::opt<int>
56  SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
57  cl::desc("Only vectorize if you gain more than this "
58  "number "));
59 
60 static cl::opt<bool>
61 ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
62  cl::desc("Attempt to vectorize horizontal reductions"));
63 
65  "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
66  cl::desc(
67  "Attempt to vectorize horizontal reductions feeding into a store"));
68 
69 static cl::opt<int>
70 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
71  cl::desc("Attempt to vectorize for this register size in bits"));
72 
73 /// Limits the size of scheduling regions in a block.
74 /// It avoid long compile times for _very_ large blocks where vector
75 /// instructions are spread over a wide range.
76 /// This limit is way higher than needed by real-world functions.
77 static cl::opt<int>
78 ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
79  cl::desc("Limit the size of the SLP scheduling region per block"));
80 
82  "slp-min-reg-size", cl::init(128), cl::Hidden,
83  cl::desc("Attempt to vectorize for this register size in bits"));
84 
86  "slp-recursion-max-depth", cl::init(12), cl::Hidden,
87  cl::desc("Limit the recursion depth when building a vectorizable tree"));
88 
90  "slp-min-tree-size", cl::init(3), cl::Hidden,
91  cl::desc("Only vectorize small trees if they are fully vectorizable"));
92 
93 // Limit the number of alias checks. The limit is chosen so that
94 // it has no negative effect on the llvm benchmarks.
95 static const unsigned AliasedCheckLimit = 10;
96 
97 // Another limit for the alias checks: The maximum distance between load/store
98 // instructions where alias checks are done.
99 // This limit is useful for very large basic blocks.
100 static const unsigned MaxMemDepDistance = 160;
101 
102 /// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
103 /// regions to be handled.
104 static const int MinScheduleRegionSize = 16;
105 
106 /// \brief Predicate for the element types that the SLP vectorizer supports.
107 ///
108 /// The most important thing to filter here are types which are invalid in LLVM
109 /// vectors. We also filter target specific types which have absolutely no
110 /// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
111 /// avoids spending time checking the cost model and realizing that they will
112 /// be inevitably scalarized.
113 static bool isValidElementType(Type *Ty) {
114  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
115  !Ty->isPPC_FP128Ty();
116 }
117 
118 /// \returns true if all of the instructions in \p VL are in the same block or
119 /// false otherwise.
121  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
122  if (!I0)
123  return false;
124  BasicBlock *BB = I0->getParent();
125  for (int i = 1, e = VL.size(); i < e; i++) {
127  if (!I)
128  return false;
129 
130  if (BB != I->getParent())
131  return false;
132  }
133  return true;
134 }
135 
136 /// \returns True if all of the values in \p VL are constants.
138  for (Value *i : VL)
139  if (!isa<Constant>(i))
140  return false;
141  return true;
142 }
143 
144 /// \returns True if all of the values in \p VL are identical.
145 static bool isSplat(ArrayRef<Value *> VL) {
146  for (unsigned i = 1, e = VL.size(); i < e; ++i)
147  if (VL[i] != VL[0])
148  return false;
149  return true;
150 }
151 
152 ///\returns Opcode that can be clubbed with \p Op to create an alternate
153 /// sequence which can later be merged as a ShuffleVector instruction.
154 static unsigned getAltOpcode(unsigned Op) {
155  switch (Op) {
156  case Instruction::FAdd:
157  return Instruction::FSub;
158  case Instruction::FSub:
159  return Instruction::FAdd;
160  case Instruction::Add:
161  return Instruction::Sub;
162  case Instruction::Sub:
163  return Instruction::Add;
164  default:
165  return 0;
166  }
167 }
168 
169 ///\returns bool representing if Opcode \p Op can be part
170 /// of an alternate sequence which can later be merged as
171 /// a ShuffleVector instruction.
172 static bool canCombineAsAltInst(unsigned Op) {
173  return Op == Instruction::FAdd || Op == Instruction::FSub ||
174  Op == Instruction::Sub || Op == Instruction::Add;
175 }
176 
177 /// \returns ShuffleVector instruction if instructions in \p VL have
178 /// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.
179 /// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)
180 static unsigned isAltInst(ArrayRef<Value *> VL) {
181  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
182  unsigned Opcode = I0->getOpcode();
183  unsigned AltOpcode = getAltOpcode(Opcode);
184  for (int i = 1, e = VL.size(); i < e; i++) {
186  if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
187  return 0;
188  }
189  return Instruction::ShuffleVector;
190 }
191 
192 /// \returns The opcode if all of the Instructions in \p VL have the same
193 /// opcode, or zero.
194 static unsigned getSameOpcode(ArrayRef<Value *> VL) {
195  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
196  if (!I0)
197  return 0;
198  unsigned Opcode = I0->getOpcode();
199  for (int i = 1, e = VL.size(); i < e; i++) {
201  if (!I || Opcode != I->getOpcode()) {
202  if (canCombineAsAltInst(Opcode) && i == 1)
203  return isAltInst(VL);
204  return 0;
205  }
206  }
207  return Opcode;
208 }
209 
210 /// Get the intersection (logical and) of all of the potential IR flags
211 /// of each scalar operation (VL) that will be converted into a vector (I).
212 /// Flag set: NSW, NUW, exact, and all of fast-math.
214  if (auto *VecOp = dyn_cast<Instruction>(I)) {
215  if (auto *Intersection = dyn_cast<Instruction>(VL[0])) {
216  // Intersection is initialized to the 0th scalar,
217  // so start counting from index '1'.
218  for (int i = 1, e = VL.size(); i < e; ++i) {
219  if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
220  Intersection->andIRFlags(Scalar);
221  }
222  VecOp->copyIRFlags(Intersection);
223  }
224  }
225 }
226 
227 /// \returns true if all of the values in \p VL have the same type or false
228 /// otherwise.
230  Type *Ty = VL[0]->getType();
231  for (int i = 1, e = VL.size(); i < e; i++)
232  if (VL[i]->getType() != Ty)
233  return false;
234 
235  return true;
236 }
237 
238 /// \returns True if Extract{Value,Element} instruction extracts element Idx.
239 static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
240  assert(Opcode == Instruction::ExtractElement ||
241  Opcode == Instruction::ExtractValue);
242  if (Opcode == Instruction::ExtractElement) {
244  return CI && CI->getZExtValue() == Idx;
245  } else {
246  ExtractValueInst *EI = cast<ExtractValueInst>(E);
247  return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
248  }
249 }
250 
251 /// \returns True if in-tree use also needs extract. This refers to
252 /// possible scalar operand in vectorized instruction.
254  TargetLibraryInfo *TLI) {
255 
256  unsigned Opcode = UserInst->getOpcode();
257  switch (Opcode) {
258  case Instruction::Load: {
259  LoadInst *LI = cast<LoadInst>(UserInst);
260  return (LI->getPointerOperand() == Scalar);
261  }
262  case Instruction::Store: {
263  StoreInst *SI = cast<StoreInst>(UserInst);
264  return (SI->getPointerOperand() == Scalar);
265  }
266  case Instruction::Call: {
267  CallInst *CI = cast<CallInst>(UserInst);
269  if (hasVectorInstrinsicScalarOpd(ID, 1)) {
270  return (CI->getArgOperand(1) == Scalar);
271  }
272  }
273  default:
274  return false;
275  }
276 }
277 
278 /// \returns the AA location that is being access by the instruction.
280  if (StoreInst *SI = dyn_cast<StoreInst>(I))
281  return MemoryLocation::get(SI);
282  if (LoadInst *LI = dyn_cast<LoadInst>(I))
283  return MemoryLocation::get(LI);
284  return MemoryLocation();
285 }
286 
287 /// \returns True if the instruction is not a volatile or atomic load/store.
288 static bool isSimple(Instruction *I) {
289  if (LoadInst *LI = dyn_cast<LoadInst>(I))
290  return LI->isSimple();
291  if (StoreInst *SI = dyn_cast<StoreInst>(I))
292  return SI->isSimple();
293  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
294  return !MI->isVolatile();
295  return true;
296 }
297 
298 namespace llvm {
299 namespace slpvectorizer {
300 /// Bottom Up SLP Vectorizer.
301 class BoUpSLP {
302 public:
307 
311  const DataLayout *DL)
312  : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
313  SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
314  DL(DL), Builder(Se->getContext()) {
315  CodeMetrics::collectEphemeralValues(F, AC, EphValues);
316  // Use the vector register size specified by the target unless overridden
317  // by a command-line option.
318  // TODO: It would be better to limit the vectorization factor based on
319  // data type rather than just register size. For example, x86 AVX has
320  // 256-bit registers, but it does not support integer operations
321  // at that width (that requires AVX2).
322  if (MaxVectorRegSizeOption.getNumOccurrences())
323  MaxVecRegSize = MaxVectorRegSizeOption;
324  else
325  MaxVecRegSize = TTI->getRegisterBitWidth(true);
326 
327  MinVecRegSize = MinVectorRegSizeOption;
328  }
329 
330  /// \brief Vectorize the tree that starts with the elements in \p VL.
331  /// Returns the vectorized root.
332  Value *vectorizeTree();
333 
334  /// \returns the cost incurred by unwanted spills and fills, caused by
335  /// holding live values over call sites.
336  int getSpillCost();
337 
338  /// \returns the vectorization cost of the subtree that starts at \p VL.
339  /// A negative number means that this is profitable.
340  int getTreeCost();
341 
342  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
343  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
344  void buildTree(ArrayRef<Value *> Roots,
345  ArrayRef<Value *> UserIgnoreLst = None);
346 
347  /// Clear the internal data structures that are created by 'buildTree'.
348  void deleteTree() {
349  VectorizableTree.clear();
350  ScalarToTreeEntry.clear();
351  MustGather.clear();
352  ExternalUses.clear();
353  NumLoadsWantToKeepOrder = 0;
354  NumLoadsWantToChangeOrder = 0;
355  for (auto &Iter : BlocksSchedules) {
356  BlockScheduling *BS = Iter.second.get();
357  BS->clear();
358  }
359  MinBWs.clear();
360  }
361 
362  /// \brief Perform LICM and CSE on the newly generated gather sequences.
363  void optimizeGatherSequence();
364 
365  /// \returns true if it is beneficial to reverse the vector order.
366  bool shouldReorder() const {
367  return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
368  }
369 
370  /// \return The vector element size in bits to use when vectorizing the
371  /// expression tree ending at \p V. If V is a store, the size is the width of
372  /// the stored value. Otherwise, the size is the width of the largest loaded
373  /// value reaching V. This method is used by the vectorizer to calculate
374  /// vectorization factors.
375  unsigned getVectorElementSize(Value *V);
376 
377  /// Compute the minimum type sizes required to represent the entries in a
378  /// vectorizable tree.
380 
381  // \returns maximum vector register size as set by TTI or overridden by cl::opt.
382  unsigned getMaxVecRegSize() const {
383  return MaxVecRegSize;
384  }
385 
386  // \returns minimum vector register size as set by cl::opt.
387  unsigned getMinVecRegSize() const {
388  return MinVecRegSize;
389  }
390 
391  /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
392  ///
393  /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
394  unsigned canMapToVector(Type *T, const DataLayout &DL) const;
395 
396  /// \returns True if the VectorizableTree is both tiny and not fully
397  /// vectorizable. We do not vectorize such trees.
398  bool isTreeTinyAndNotFullyVectorizable();
399 
400 private:
401  struct TreeEntry;
402 
403  /// \returns the cost of the vectorizable entry.
404  int getEntryCost(TreeEntry *E);
405 
406  /// This is the recursive part of buildTree.
407  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
408 
409  /// \returns True if the ExtractElement/ExtractValue instructions in VL can
410  /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
411  bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
412 
413  /// Vectorize a single entry in the tree.
414  Value *vectorizeTree(TreeEntry *E);
415 
416  /// Vectorize a single entry in the tree, starting in \p VL.
417  Value *vectorizeTree(ArrayRef<Value *> VL);
418 
419  /// \returns the pointer to the vectorized value if \p VL is already
420  /// vectorized, or NULL. They may happen in cycles.
421  Value *alreadyVectorized(ArrayRef<Value *> VL) const;
422 
423  /// \returns the scalarization cost for this type. Scalarization in this
424  /// context means the creation of vectors from a group of scalars.
425  int getGatherCost(Type *Ty);
426 
427  /// \returns the scalarization cost for this list of values. Assuming that
428  /// this subtree gets vectorized, we may need to extract the values from the
429  /// roots. This method calculates the cost of extracting the values.
430  int getGatherCost(ArrayRef<Value *> VL);
431 
432  /// \brief Set the Builder insert point to one after the last instruction in
433  /// the bundle
434  void setInsertPointAfterBundle(ArrayRef<Value *> VL);
435 
436  /// \returns a vector from a collection of scalars in \p VL.
437  Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
438 
439  /// \returns whether the VectorizableTree is fully vectorizable and will
440  /// be beneficial even the tree height is tiny.
441  bool isFullyVectorizableTinyTree();
442 
443  /// \reorder commutative operands in alt shuffle if they result in
444  /// vectorized code.
445  void reorderAltShuffleOperands(ArrayRef<Value *> VL,
447  SmallVectorImpl<Value *> &Right);
448  /// \reorder commutative operands to get better probability of
449  /// generating vectorized code.
450  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
452  SmallVectorImpl<Value *> &Right);
453  struct TreeEntry {
454  TreeEntry() : Scalars(), VectorizedValue(nullptr),
455  NeedToGather(0) {}
456 
457  /// \returns true if the scalars in VL are equal to this entry.
458  bool isSame(ArrayRef<Value *> VL) const {
459  assert(VL.size() == Scalars.size() && "Invalid size");
460  return std::equal(VL.begin(), VL.end(), Scalars.begin());
461  }
462 
463  /// A vector of scalars.
464  ValueList Scalars;
465 
466  /// The Scalars are vectorized into this value. It is initialized to Null.
467  Value *VectorizedValue;
468 
469  /// Do we need to gather this sequence ?
470  bool NeedToGather;
471  };
472 
473  /// Create a new VectorizableTree entry.
474  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
475  VectorizableTree.emplace_back();
476  int idx = VectorizableTree.size() - 1;
477  TreeEntry *Last = &VectorizableTree[idx];
478  Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
479  Last->NeedToGather = !Vectorized;
480  if (Vectorized) {
481  for (int i = 0, e = VL.size(); i != e; ++i) {
482  assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
483  ScalarToTreeEntry[VL[i]] = idx;
484  }
485  } else {
486  MustGather.insert(VL.begin(), VL.end());
487  }
488  return Last;
489  }
490 
491  /// -- Vectorization State --
492  /// Holds all of the tree entries.
493  std::vector<TreeEntry> VectorizableTree;
494 
495  /// Maps a specific scalar to its tree entry.
496  SmallDenseMap<Value*, int> ScalarToTreeEntry;
497 
498  /// A list of scalars that we found that we need to keep as scalars.
499  ValueSet MustGather;
500 
501  /// This POD struct describes one external user in the vectorized tree.
502  struct ExternalUser {
503  ExternalUser (Value *S, llvm::User *U, int L) :
504  Scalar(S), User(U), Lane(L){}
505  // Which scalar in our function.
506  Value *Scalar;
507  // Which user that uses the scalar.
508  llvm::User *User;
509  // Which lane does the scalar belong to.
510  int Lane;
511  };
512  typedef SmallVector<ExternalUser, 16> UserList;
513 
514  /// Checks if two instructions may access the same memory.
515  ///
516  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
517  /// is invariant in the calling loop.
518  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
519  Instruction *Inst2) {
520 
521  // First check if the result is already in the cache.
522  AliasCacheKey key = std::make_pair(Inst1, Inst2);
523  Optional<bool> &result = AliasCache[key];
524  if (result.hasValue()) {
525  return result.getValue();
526  }
527  MemoryLocation Loc2 = getLocation(Inst2, AA);
528  bool aliased = true;
529  if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
530  // Do the alias check.
531  aliased = AA->alias(Loc1, Loc2);
532  }
533  // Store the result in the cache.
534  result = aliased;
535  return aliased;
536  }
537 
538  typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
539 
540  /// Cache for alias results.
541  /// TODO: consider moving this to the AliasAnalysis itself.
543 
544  /// Removes an instruction from its block and eventually deletes it.
545  /// It's like Instruction::eraseFromParent() except that the actual deletion
546  /// is delayed until BoUpSLP is destructed.
547  /// This is required to ensure that there are no incorrect collisions in the
548  /// AliasCache, which can happen if a new instruction is allocated at the
549  /// same address as a previously deleted instruction.
550  void eraseInstruction(Instruction *I) {
551  I->removeFromParent();
552  I->dropAllReferences();
553  DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
554  }
555 
556  /// Temporary store for deleted instructions. Instructions will be deleted
557  /// eventually when the BoUpSLP is destructed.
558  SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
559 
560  /// A list of values that need to extracted out of the tree.
561  /// This list holds pairs of (Internal Scalar : External User).
562  UserList ExternalUses;
563 
564  /// Values used only by @llvm.assume calls.
566 
567  /// Holds all of the instructions that we gathered.
568  SetVector<Instruction *> GatherSeq;
569  /// A list of blocks that we are going to CSE.
570  SetVector<BasicBlock *> CSEBlocks;
571 
572  /// Contains all scheduling relevant data for an instruction.
573  /// A ScheduleData either represents a single instruction or a member of an
574  /// instruction bundle (= a group of instructions which is combined into a
575  /// vector instruction).
576  struct ScheduleData {
577 
578  // The initial value for the dependency counters. It means that the
579  // dependencies are not calculated yet.
580  enum { InvalidDeps = -1 };
581 
582  ScheduleData()
583  : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
584  NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
585  Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
586  UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
587 
588  void init(int BlockSchedulingRegionID) {
589  FirstInBundle = this;
590  NextInBundle = nullptr;
591  NextLoadStore = nullptr;
592  IsScheduled = false;
593  SchedulingRegionID = BlockSchedulingRegionID;
594  UnscheduledDepsInBundle = UnscheduledDeps;
595  clearDependencies();
596  }
597 
598  /// Returns true if the dependency information has been calculated.
599  bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
600 
601  /// Returns true for single instructions and for bundle representatives
602  /// (= the head of a bundle).
603  bool isSchedulingEntity() const { return FirstInBundle == this; }
604 
605  /// Returns true if it represents an instruction bundle and not only a
606  /// single instruction.
607  bool isPartOfBundle() const {
608  return NextInBundle != nullptr || FirstInBundle != this;
609  }
610 
611  /// Returns true if it is ready for scheduling, i.e. it has no more
612  /// unscheduled depending instructions/bundles.
613  bool isReady() const {
614  assert(isSchedulingEntity() &&
615  "can't consider non-scheduling entity for ready list");
616  return UnscheduledDepsInBundle == 0 && !IsScheduled;
617  }
618 
619  /// Modifies the number of unscheduled dependencies, also updating it for
620  /// the whole bundle.
621  int incrementUnscheduledDeps(int Incr) {
622  UnscheduledDeps += Incr;
623  return FirstInBundle->UnscheduledDepsInBundle += Incr;
624  }
625 
626  /// Sets the number of unscheduled dependencies to the number of
627  /// dependencies.
628  void resetUnscheduledDeps() {
629  incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
630  }
631 
632  /// Clears all dependency information.
633  void clearDependencies() {
634  Dependencies = InvalidDeps;
635  resetUnscheduledDeps();
636  MemoryDependencies.clear();
637  }
638 
639  void dump(raw_ostream &os) const {
640  if (!isSchedulingEntity()) {
641  os << "/ " << *Inst;
642  } else if (NextInBundle) {
643  os << '[' << *Inst;
644  ScheduleData *SD = NextInBundle;
645  while (SD) {
646  os << ';' << *SD->Inst;
647  SD = SD->NextInBundle;
648  }
649  os << ']';
650  } else {
651  os << *Inst;
652  }
653  }
654 
655  Instruction *Inst;
656 
657  /// Points to the head in an instruction bundle (and always to this for
658  /// single instructions).
659  ScheduleData *FirstInBundle;
660 
661  /// Single linked list of all instructions in a bundle. Null if it is a
662  /// single instruction.
663  ScheduleData *NextInBundle;
664 
665  /// Single linked list of all memory instructions (e.g. load, store, call)
666  /// in the block - until the end of the scheduling region.
667  ScheduleData *NextLoadStore;
668 
669  /// The dependent memory instructions.
670  /// This list is derived on demand in calculateDependencies().
671  SmallVector<ScheduleData *, 4> MemoryDependencies;
672 
673  /// This ScheduleData is in the current scheduling region if this matches
674  /// the current SchedulingRegionID of BlockScheduling.
675  int SchedulingRegionID;
676 
677  /// Used for getting a "good" final ordering of instructions.
678  int SchedulingPriority;
679 
680  /// The number of dependencies. Constitutes of the number of users of the
681  /// instruction plus the number of dependent memory instructions (if any).
682  /// This value is calculated on demand.
683  /// If InvalidDeps, the number of dependencies is not calculated yet.
684  ///
685  int Dependencies;
686 
687  /// The number of dependencies minus the number of dependencies of scheduled
688  /// instructions. As soon as this is zero, the instruction/bundle gets ready
689  /// for scheduling.
690  /// Note that this is negative as long as Dependencies is not calculated.
691  int UnscheduledDeps;
692 
693  /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
694  /// single instructions.
695  int UnscheduledDepsInBundle;
696 
697  /// True if this instruction is scheduled (or considered as scheduled in the
698  /// dry-run).
699  bool IsScheduled;
700  };
701 
702 #ifndef NDEBUG
703  friend inline raw_ostream &operator<<(raw_ostream &os,
704  const BoUpSLP::ScheduleData &SD) {
705  SD.dump(os);
706  return os;
707  }
708 #endif
709 
710  /// Contains all scheduling data for a basic block.
711  ///
712  struct BlockScheduling {
713 
714  BlockScheduling(BasicBlock *BB)
715  : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
716  ScheduleStart(nullptr), ScheduleEnd(nullptr),
717  FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
718  ScheduleRegionSize(0),
719  ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
720  // Make sure that the initial SchedulingRegionID is greater than the
721  // initial SchedulingRegionID in ScheduleData (which is 0).
722  SchedulingRegionID(1) {}
723 
724  void clear() {
725  ReadyInsts.clear();
726  ScheduleStart = nullptr;
727  ScheduleEnd = nullptr;
728  FirstLoadStoreInRegion = nullptr;
729  LastLoadStoreInRegion = nullptr;
730 
731  // Reduce the maximum schedule region size by the size of the
732  // previous scheduling run.
733  ScheduleRegionSizeLimit -= ScheduleRegionSize;
734  if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
735  ScheduleRegionSizeLimit = MinScheduleRegionSize;
736  ScheduleRegionSize = 0;
737 
738  // Make a new scheduling region, i.e. all existing ScheduleData is not
739  // in the new region yet.
740  ++SchedulingRegionID;
741  }
742 
743  ScheduleData *getScheduleData(Value *V) {
744  ScheduleData *SD = ScheduleDataMap[V];
745  if (SD && SD->SchedulingRegionID == SchedulingRegionID)
746  return SD;
747  return nullptr;
748  }
749 
750  bool isInSchedulingRegion(ScheduleData *SD) {
751  return SD->SchedulingRegionID == SchedulingRegionID;
752  }
753 
754  /// Marks an instruction as scheduled and puts all dependent ready
755  /// instructions into the ready-list.
756  template <typename ReadyListType>
757  void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
758  SD->IsScheduled = true;
759  DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
760 
761  ScheduleData *BundleMember = SD;
762  while (BundleMember) {
763  // Handle the def-use chain dependencies.
764  for (Use &U : BundleMember->Inst->operands()) {
765  ScheduleData *OpDef = getScheduleData(U.get());
766  if (OpDef && OpDef->hasValidDependencies() &&
767  OpDef->incrementUnscheduledDeps(-1) == 0) {
768  // There are no more unscheduled dependencies after decrementing,
769  // so we can put the dependent instruction into the ready list.
770  ScheduleData *DepBundle = OpDef->FirstInBundle;
771  assert(!DepBundle->IsScheduled &&
772  "already scheduled bundle gets ready");
773  ReadyList.insert(DepBundle);
774  DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
775  }
776  }
777  // Handle the memory dependencies.
778  for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
779  if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
780  // There are no more unscheduled dependencies after decrementing,
781  // so we can put the dependent instruction into the ready list.
782  ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
783  assert(!DepBundle->IsScheduled &&
784  "already scheduled bundle gets ready");
785  ReadyList.insert(DepBundle);
786  DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
787  }
788  }
789  BundleMember = BundleMember->NextInBundle;
790  }
791  }
792 
793  /// Put all instructions into the ReadyList which are ready for scheduling.
794  template <typename ReadyListType>
795  void initialFillReadyList(ReadyListType &ReadyList) {
796  for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
797  ScheduleData *SD = getScheduleData(I);
798  if (SD->isSchedulingEntity() && SD->isReady()) {
799  ReadyList.insert(SD);
800  DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
801  }
802  }
803  }
804 
805  /// Checks if a bundle of instructions can be scheduled, i.e. has no
806  /// cyclic dependencies. This is only a dry-run, no instructions are
807  /// actually moved at this stage.
808  bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
809 
810  /// Un-bundles a group of instructions.
811  void cancelScheduling(ArrayRef<Value *> VL);
812 
813  /// Extends the scheduling region so that V is inside the region.
814  /// \returns true if the region size is within the limit.
815  bool extendSchedulingRegion(Value *V);
816 
817  /// Initialize the ScheduleData structures for new instructions in the
818  /// scheduling region.
819  void initScheduleData(Instruction *FromI, Instruction *ToI,
820  ScheduleData *PrevLoadStore,
821  ScheduleData *NextLoadStore);
822 
823  /// Updates the dependency information of a bundle and of all instructions/
824  /// bundles which depend on the original bundle.
825  void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
826  BoUpSLP *SLP);
827 
828  /// Sets all instruction in the scheduling region to un-scheduled.
829  void resetSchedule();
830 
831  BasicBlock *BB;
832 
833  /// Simple memory allocation for ScheduleData.
834  std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
835 
836  /// The size of a ScheduleData array in ScheduleDataChunks.
837  int ChunkSize;
838 
839  /// The allocator position in the current chunk, which is the last entry
840  /// of ScheduleDataChunks.
841  int ChunkPos;
842 
843  /// Attaches ScheduleData to Instruction.
844  /// Note that the mapping survives during all vectorization iterations, i.e.
845  /// ScheduleData structures are recycled.
846  DenseMap<Value *, ScheduleData *> ScheduleDataMap;
847 
848  struct ReadyList : SmallVector<ScheduleData *, 8> {
849  void insert(ScheduleData *SD) { push_back(SD); }
850  };
851 
852  /// The ready-list for scheduling (only used for the dry-run).
853  ReadyList ReadyInsts;
854 
855  /// The first instruction of the scheduling region.
856  Instruction *ScheduleStart;
857 
858  /// The first instruction _after_ the scheduling region.
859  Instruction *ScheduleEnd;
860 
861  /// The first memory accessing instruction in the scheduling region
862  /// (can be null).
863  ScheduleData *FirstLoadStoreInRegion;
864 
865  /// The last memory accessing instruction in the scheduling region
866  /// (can be null).
867  ScheduleData *LastLoadStoreInRegion;
868 
869  /// The current size of the scheduling region.
870  int ScheduleRegionSize;
871 
872  /// The maximum size allowed for the scheduling region.
873  int ScheduleRegionSizeLimit;
874 
875  /// The ID of the scheduling region. For a new vectorization iteration this
876  /// is incremented which "removes" all ScheduleData from the region.
877  int SchedulingRegionID;
878  };
879 
880  /// Attaches the BlockScheduling structures to basic blocks.
882 
883  /// Performs the "real" scheduling. Done before vectorization is actually
884  /// performed in a basic block.
885  void scheduleBlock(BlockScheduling *BS);
886 
887  /// List of users to ignore during scheduling and that don't need extracting.
888  ArrayRef<Value *> UserIgnoreList;
889 
890  // Number of load bundles that contain consecutive loads.
891  int NumLoadsWantToKeepOrder;
892 
893  // Number of load bundles that contain consecutive loads in reversed order.
894  int NumLoadsWantToChangeOrder;
895 
896  // Analysis and block reference.
897  Function *F;
898  ScalarEvolution *SE;
899  TargetTransformInfo *TTI;
900  TargetLibraryInfo *TLI;
901  AliasAnalysis *AA;
902  LoopInfo *LI;
903  DominatorTree *DT;
904  AssumptionCache *AC;
905  DemandedBits *DB;
906  const DataLayout *DL;
907  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
908  unsigned MinVecRegSize; // Set by cl::opt (default: 128).
909  /// Instruction builder to construct the vectorized tree.
910  IRBuilder<> Builder;
911 
912  /// A map of scalar integer values to the smallest bit width with which they
913  /// can legally be represented. The values map to (width, signed) pairs,
914  /// where "width" indicates the minimum bit width and "signed" is True if the
915  /// value must be signed-extended, rather than zero-extended, back to its
916  /// original width.
918 };
919 
920 } // end namespace llvm
921 } // end namespace slpvectorizer
922 
924  ArrayRef<Value *> UserIgnoreLst) {
925  deleteTree();
926  UserIgnoreList = UserIgnoreLst;
927  if (!allSameType(Roots))
928  return;
929  buildTree_rec(Roots, 0);
930 
931  // Collect the values that we need to extract from the tree.
932  for (TreeEntry &EIdx : VectorizableTree) {
933  TreeEntry *Entry = &EIdx;
934 
935  // For each lane:
936  for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
937  Value *Scalar = Entry->Scalars[Lane];
938 
939  // No need to handle users of gathered values.
940  if (Entry->NeedToGather)
941  continue;
942 
943  for (User *U : Scalar->users()) {
944  DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
945 
946  Instruction *UserInst = dyn_cast<Instruction>(U);
947  if (!UserInst)
948  continue;
949 
950  // Skip in-tree scalars that become vectors
951  if (ScalarToTreeEntry.count(U)) {
952  int Idx = ScalarToTreeEntry[U];
953  TreeEntry *UseEntry = &VectorizableTree[Idx];
954  Value *UseScalar = UseEntry->Scalars[0];
955  // Some in-tree scalars will remain as scalar in vectorized
956  // instructions. If that is the case, the one in Lane 0 will
957  // be used.
958  if (UseScalar != U ||
959  !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
960  DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
961  << ".\n");
962  assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
963  continue;
964  }
965  }
966 
967  // Ignore users in the user ignore list.
968  if (is_contained(UserIgnoreList, UserInst))
969  continue;
970 
971  DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
972  Lane << " from " << *Scalar << ".\n");
973  ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
974  }
975  }
976  }
977 }
978 
979 
980 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
981  bool isAltShuffle = false;
982  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
983 
984  if (Depth == RecursionMaxDepth) {
985  DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
986  newTreeEntry(VL, false);
987  return;
988  }
989 
990  // Don't handle vectors.
991  if (VL[0]->getType()->isVectorTy()) {
992  DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
993  newTreeEntry(VL, false);
994  return;
995  }
996 
997  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
998  if (SI->getValueOperand()->getType()->isVectorTy()) {
999  DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
1000  newTreeEntry(VL, false);
1001  return;
1002  }
1003  unsigned Opcode = getSameOpcode(VL);
1004 
1005  // Check that this shuffle vector refers to the alternate
1006  // sequence of opcodes.
1007  if (Opcode == Instruction::ShuffleVector) {
1008  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
1009  unsigned Op = I0->getOpcode();
1010  if (Op != Instruction::ShuffleVector)
1011  isAltShuffle = true;
1012  }
1013 
1014  // If all of the operands are identical or constant we have a simple solution.
1015  if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
1016  DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
1017  newTreeEntry(VL, false);
1018  return;
1019  }
1020 
1021  // We now know that this is a vector of instructions of the same type from
1022  // the same block.
1023 
1024  // Don't vectorize ephemeral values.
1025  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1026  if (EphValues.count(VL[i])) {
1027  DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1028  ") is ephemeral.\n");
1029  newTreeEntry(VL, false);
1030  return;
1031  }
1032  }
1033 
1034  // Check if this is a duplicate of another entry.
1035  if (ScalarToTreeEntry.count(VL[0])) {
1036  int Idx = ScalarToTreeEntry[VL[0]];
1037  TreeEntry *E = &VectorizableTree[Idx];
1038  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1039  DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
1040  if (E->Scalars[i] != VL[i]) {
1041  DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
1042  newTreeEntry(VL, false);
1043  return;
1044  }
1045  }
1046  DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
1047  return;
1048  }
1049 
1050  // Check that none of the instructions in the bundle are already in the tree.
1051  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1052  if (ScalarToTreeEntry.count(VL[i])) {
1053  DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1054  ") is already in tree.\n");
1055  newTreeEntry(VL, false);
1056  return;
1057  }
1058  }
1059 
1060  // If any of the scalars is marked as a value that needs to stay scalar then
1061  // we need to gather the scalars.
1062  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1063  if (MustGather.count(VL[i])) {
1064  DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
1065  newTreeEntry(VL, false);
1066  return;
1067  }
1068  }
1069 
1070  // Check that all of the users of the scalars that we want to vectorize are
1071  // schedulable.
1072  Instruction *VL0 = cast<Instruction>(VL[0]);
1073  BasicBlock *BB = cast<Instruction>(VL0)->getParent();
1074 
1075  if (!DT->isReachableFromEntry(BB)) {
1076  // Don't go into unreachable blocks. They may contain instructions with
1077  // dependency cycles which confuse the final scheduling.
1078  DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
1079  newTreeEntry(VL, false);
1080  return;
1081  }
1082 
1083  // Check that every instructions appears once in this bundle.
1084  for (unsigned i = 0, e = VL.size(); i < e; ++i)
1085  for (unsigned j = i+1; j < e; ++j)
1086  if (VL[i] == VL[j]) {
1087  DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
1088  newTreeEntry(VL, false);
1089  return;
1090  }
1091 
1092  auto &BSRef = BlocksSchedules[BB];
1093  if (!BSRef) {
1094  BSRef = llvm::make_unique<BlockScheduling>(BB);
1095  }
1096  BlockScheduling &BS = *BSRef.get();
1097 
1098  if (!BS.tryScheduleBundle(VL, this)) {
1099  DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
1100  assert((!BS.getScheduleData(VL[0]) ||
1101  !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
1102  "tryScheduleBundle should cancelScheduling on failure");
1103  newTreeEntry(VL, false);
1104  return;
1105  }
1106  DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
1107 
1108  switch (Opcode) {
1109  case Instruction::PHI: {
1110  PHINode *PH = dyn_cast<PHINode>(VL0);
1111 
1112  // Check for terminator values (e.g. invoke).
1113  for (unsigned j = 0; j < VL.size(); ++j)
1114  for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1116  cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
1117  if (Term) {
1118  DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
1119  BS.cancelScheduling(VL);
1120  newTreeEntry(VL, false);
1121  return;
1122  }
1123  }
1124 
1125  newTreeEntry(VL, true);
1126  DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
1127 
1128  for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1129  ValueList Operands;
1130  // Prepare the operand vector.
1131  for (Value *j : VL)
1132  Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
1133  PH->getIncomingBlock(i)));
1134 
1135  buildTree_rec(Operands, Depth + 1);
1136  }
1137  return;
1138  }
1139  case Instruction::ExtractValue:
1140  case Instruction::ExtractElement: {
1141  bool Reuse = canReuseExtract(VL, Opcode);
1142  if (Reuse) {
1143  DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
1144  } else {
1145  BS.cancelScheduling(VL);
1146  }
1147  newTreeEntry(VL, Reuse);
1148  return;
1149  }
1150  case Instruction::Load: {
1151  // Check that a vectorized load would load the same memory as a scalar
1152  // load.
1153  // For example we don't want vectorize loads that are smaller than 8 bit.
1154  // Even though we have a packed struct {<i2, i2, i2, i2>} LLVM treats
1155  // loading/storing it as an i8 struct. If we vectorize loads/stores from
1156  // such a struct we read/write packed bits disagreeing with the
1157  // unvectorized version.
1158  Type *ScalarTy = VL[0]->getType();
1159 
1160  if (DL->getTypeSizeInBits(ScalarTy) !=
1161  DL->getTypeAllocSizeInBits(ScalarTy)) {
1162  BS.cancelScheduling(VL);
1163  newTreeEntry(VL, false);
1164  DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
1165  return;
1166  }
1167 
1168  // Make sure all loads in the bundle are simple - we can't vectorize
1169  // atomic or volatile loads.
1170  for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
1171  LoadInst *L = cast<LoadInst>(VL[i]);
1172  if (!L->isSimple()) {
1173  BS.cancelScheduling(VL);
1174  newTreeEntry(VL, false);
1175  DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
1176  return;
1177  }
1178  }
1179 
1180  // Check if the loads are consecutive, reversed, or neither.
1181  // TODO: What we really want is to sort the loads, but for now, check
1182  // the two likely directions.
1183  bool Consecutive = true;
1184  bool ReverseConsecutive = true;
1185  for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
1186  if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
1187  Consecutive = false;
1188  break;
1189  } else {
1190  ReverseConsecutive = false;
1191  }
1192  }
1193 
1194  if (Consecutive) {
1195  ++NumLoadsWantToKeepOrder;
1196  newTreeEntry(VL, true);
1197  DEBUG(dbgs() << "SLP: added a vector of loads.\n");
1198  return;
1199  }
1200 
1201  // If none of the load pairs were consecutive when checked in order,
1202  // check the reverse order.
1203  if (ReverseConsecutive)
1204  for (unsigned i = VL.size() - 1; i > 0; --i)
1205  if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
1206  ReverseConsecutive = false;
1207  break;
1208  }
1209 
1210  BS.cancelScheduling(VL);
1211  newTreeEntry(VL, false);
1212 
1213  if (ReverseConsecutive) {
1214  ++NumLoadsWantToChangeOrder;
1215  DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
1216  } else {
1217  DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
1218  }
1219  return;
1220  }
1221  case Instruction::ZExt:
1222  case Instruction::SExt:
1223  case Instruction::FPToUI:
1224  case Instruction::FPToSI:
1225  case Instruction::FPExt:
1226  case Instruction::PtrToInt:
1227  case Instruction::IntToPtr:
1228  case Instruction::SIToFP:
1229  case Instruction::UIToFP:
1230  case Instruction::Trunc:
1231  case Instruction::FPTrunc:
1232  case Instruction::BitCast: {
1233  Type *SrcTy = VL0->getOperand(0)->getType();
1234  for (unsigned i = 0; i < VL.size(); ++i) {
1235  Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
1236  if (Ty != SrcTy || !isValidElementType(Ty)) {
1237  BS.cancelScheduling(VL);
1238  newTreeEntry(VL, false);
1239  DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
1240  return;
1241  }
1242  }
1243  newTreeEntry(VL, true);
1244  DEBUG(dbgs() << "SLP: added a vector of casts.\n");
1245 
1246  for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1247  ValueList Operands;
1248  // Prepare the operand vector.
1249  for (Value *j : VL)
1250  Operands.push_back(cast<Instruction>(j)->getOperand(i));
1251 
1252  buildTree_rec(Operands, Depth+1);
1253  }
1254  return;
1255  }
1256  case Instruction::ICmp:
1257  case Instruction::FCmp: {
1258  // Check that all of the compares have the same predicate.
1259  CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
1260  Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
1261  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
1262  CmpInst *Cmp = cast<CmpInst>(VL[i]);
1263  if (Cmp->getPredicate() != P0 ||
1264  Cmp->getOperand(0)->getType() != ComparedTy) {
1265  BS.cancelScheduling(VL);
1266  newTreeEntry(VL, false);
1267  DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
1268  return;
1269  }
1270  }
1271 
1272  newTreeEntry(VL, true);
1273  DEBUG(dbgs() << "SLP: added a vector of compares.\n");
1274 
1275  for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1276  ValueList Operands;
1277  // Prepare the operand vector.
1278  for (Value *j : VL)
1279  Operands.push_back(cast<Instruction>(j)->getOperand(i));
1280 
1281  buildTree_rec(Operands, Depth+1);
1282  }
1283  return;
1284  }
1285  case Instruction::Select:
1286  case Instruction::Add:
1287  case Instruction::FAdd:
1288  case Instruction::Sub:
1289  case Instruction::FSub:
1290  case Instruction::Mul:
1291  case Instruction::FMul:
1292  case Instruction::UDiv:
1293  case Instruction::SDiv:
1294  case Instruction::FDiv:
1295  case Instruction::URem:
1296  case Instruction::SRem:
1297  case Instruction::FRem:
1298  case Instruction::Shl:
1299  case Instruction::LShr:
1300  case Instruction::AShr:
1301  case Instruction::And:
1302  case Instruction::Or:
1303  case Instruction::Xor: {
1304  newTreeEntry(VL, true);
1305  DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
1306 
1307  // Sort operands of the instructions so that each side is more likely to
1308  // have the same opcode.
1309  if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
1310  ValueList Left, Right;
1311  reorderInputsAccordingToOpcode(VL, Left, Right);
1312  buildTree_rec(Left, Depth + 1);
1313  buildTree_rec(Right, Depth + 1);
1314  return;
1315  }
1316 
1317  for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1318  ValueList Operands;
1319  // Prepare the operand vector.
1320  for (Value *j : VL)
1321  Operands.push_back(cast<Instruction>(j)->getOperand(i));
1322 
1323  buildTree_rec(Operands, Depth+1);
1324  }
1325  return;
1326  }
1327  case Instruction::GetElementPtr: {
1328  // We don't combine GEPs with complicated (nested) indexing.
1329  for (unsigned j = 0; j < VL.size(); ++j) {
1330  if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
1331  DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
1332  BS.cancelScheduling(VL);
1333  newTreeEntry(VL, false);
1334  return;
1335  }
1336  }
1337 
1338  // We can't combine several GEPs into one vector if they operate on
1339  // different types.
1340  Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
1341  for (unsigned j = 0; j < VL.size(); ++j) {
1342  Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
1343  if (Ty0 != CurTy) {
1344  DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
1345  BS.cancelScheduling(VL);
1346  newTreeEntry(VL, false);
1347  return;
1348  }
1349  }
1350 
1351  // We don't combine GEPs with non-constant indexes.
1352  for (unsigned j = 0; j < VL.size(); ++j) {
1353  auto Op = cast<Instruction>(VL[j])->getOperand(1);
1354  if (!isa<ConstantInt>(Op)) {
1355  DEBUG(
1356  dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
1357  BS.cancelScheduling(VL);
1358  newTreeEntry(VL, false);
1359  return;
1360  }
1361  }
1362 
1363  newTreeEntry(VL, true);
1364  DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
1365  for (unsigned i = 0, e = 2; i < e; ++i) {
1366  ValueList Operands;
1367  // Prepare the operand vector.
1368  for (Value *j : VL)
1369  Operands.push_back(cast<Instruction>(j)->getOperand(i));
1370 
1371  buildTree_rec(Operands, Depth + 1);
1372  }
1373  return;
1374  }
1375  case Instruction::Store: {
1376  // Check if the stores are consecutive or of we need to swizzle them.
1377  for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
1378  if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
1379  BS.cancelScheduling(VL);
1380  newTreeEntry(VL, false);
1381  DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
1382  return;
1383  }
1384 
1385  newTreeEntry(VL, true);
1386  DEBUG(dbgs() << "SLP: added a vector of stores.\n");
1387 
1388  ValueList Operands;
1389  for (Value *j : VL)
1390  Operands.push_back(cast<Instruction>(j)->getOperand(0));
1391 
1392  buildTree_rec(Operands, Depth + 1);
1393  return;
1394  }
1395  case Instruction::Call: {
1396  // Check if the calls are all to the same vectorizable intrinsic.
1397  CallInst *CI = cast<CallInst>(VL[0]);
1398  // Check if this is an Intrinsic call or something that can be
1399  // represented by an intrinsic call
1401  if (!isTriviallyVectorizable(ID)) {
1402  BS.cancelScheduling(VL);
1403  newTreeEntry(VL, false);
1404  DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
1405  return;
1406  }
1407  Function *Int = CI->getCalledFunction();
1408  Value *A1I = nullptr;
1409  if (hasVectorInstrinsicScalarOpd(ID, 1))
1410  A1I = CI->getArgOperand(1);
1411  for (unsigned i = 1, e = VL.size(); i != e; ++i) {
1412  CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
1413  if (!CI2 || CI2->getCalledFunction() != Int ||
1414  getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
1415  !CI->hasIdenticalOperandBundleSchema(*CI2)) {
1416  BS.cancelScheduling(VL);
1417  newTreeEntry(VL, false);
1418  DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
1419  << "\n");
1420  return;
1421  }
1422  // ctlz,cttz and powi are special intrinsics whose second argument
1423  // should be same in order for them to be vectorized.
1424  if (hasVectorInstrinsicScalarOpd(ID, 1)) {
1425  Value *A1J = CI2->getArgOperand(1);
1426  if (A1I != A1J) {
1427  BS.cancelScheduling(VL);
1428  newTreeEntry(VL, false);
1429  DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
1430  << " argument "<< A1I<<"!=" << A1J
1431  << "\n");
1432  return;
1433  }
1434  }
1435  // Verify that the bundle operands are identical between the two calls.
1436  if (CI->hasOperandBundles() &&
1438  CI->op_begin() + CI->getBundleOperandsEndIndex(),
1439  CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
1440  BS.cancelScheduling(VL);
1441  newTreeEntry(VL, false);
1442  DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
1443  << *VL[i] << '\n');
1444  return;
1445  }
1446  }
1447 
1448  newTreeEntry(VL, true);
1449  for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
1450  ValueList Operands;
1451  // Prepare the operand vector.
1452  for (Value *j : VL) {
1453  CallInst *CI2 = dyn_cast<CallInst>(j);
1454  Operands.push_back(CI2->getArgOperand(i));
1455  }
1456  buildTree_rec(Operands, Depth + 1);
1457  }
1458  return;
1459  }
1460  case Instruction::ShuffleVector: {
1461  // If this is not an alternate sequence of opcode like add-sub
1462  // then do not vectorize this instruction.
1463  if (!isAltShuffle) {
1464  BS.cancelScheduling(VL);
1465  newTreeEntry(VL, false);
1466  DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
1467  return;
1468  }
1469  newTreeEntry(VL, true);
1470  DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
1471 
1472  // Reorder operands if reordering would enable vectorization.
1473  if (isa<BinaryOperator>(VL0)) {
1474  ValueList Left, Right;
1475  reorderAltShuffleOperands(VL, Left, Right);
1476  buildTree_rec(Left, Depth + 1);
1477  buildTree_rec(Right, Depth + 1);
1478  return;
1479  }
1480 
1481  for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1482  ValueList Operands;
1483  // Prepare the operand vector.
1484  for (Value *j : VL)
1485  Operands.push_back(cast<Instruction>(j)->getOperand(i));
1486 
1487  buildTree_rec(Operands, Depth + 1);
1488  }
1489  return;
1490  }
1491  default:
1492  BS.cancelScheduling(VL);
1493  newTreeEntry(VL, false);
1494  DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
1495  return;
1496  }
1497 }
1498 
1499 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
1500  unsigned N;
1501  Type *EltTy;
1502  auto *ST = dyn_cast<StructType>(T);
1503  if (ST) {
1504  N = ST->getNumElements();
1505  EltTy = *ST->element_begin();
1506  } else {
1507  N = cast<ArrayType>(T)->getNumElements();
1508  EltTy = cast<ArrayType>(T)->getElementType();
1509  }
1510  if (!isValidElementType(EltTy))
1511  return 0;
1512  uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
1513  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
1514  return 0;
1515  if (ST) {
1516  // Check that struct is homogeneous.
1517  for (const auto *Ty : ST->elements())
1518  if (Ty != EltTy)
1519  return 0;
1520  }
1521  return N;
1522 }
1523 
1524 bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
1525  assert(Opcode == Instruction::ExtractElement ||
1526  Opcode == Instruction::ExtractValue);
1527  assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
1528  // Check if all of the extracts come from the same vector and from the
1529  // correct offset.
1530  Value *VL0 = VL[0];
1531  Instruction *E0 = cast<Instruction>(VL0);
1532  Value *Vec = E0->getOperand(0);
1533 
1534  // We have to extract from a vector/aggregate with the same number of elements.
1535  unsigned NElts;
1536  if (Opcode == Instruction::ExtractValue) {
1537  const DataLayout &DL = E0->getModule()->getDataLayout();
1538  NElts = canMapToVector(Vec->getType(), DL);
1539  if (!NElts)
1540  return false;
1541  // Check if load can be rewritten as load of vector.
1542  LoadInst *LI = dyn_cast<LoadInst>(Vec);
1543  if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
1544  return false;
1545  } else {
1546  NElts = Vec->getType()->getVectorNumElements();
1547  }
1548 
1549  if (NElts != VL.size())
1550  return false;
1551 
1552  // Check that all of the indices extract from the correct offset.
1553  if (!matchExtractIndex(E0, 0, Opcode))
1554  return false;
1555 
1556  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
1557  Instruction *E = cast<Instruction>(VL[i]);
1558  if (!matchExtractIndex(E, i, Opcode))
1559  return false;
1560  if (E->getOperand(0) != Vec)
1561  return false;
1562  }
1563 
1564  return true;
1565 }
1566 
1567 int BoUpSLP::getEntryCost(TreeEntry *E) {
1568  ArrayRef<Value*> VL = E->Scalars;
1569 
1570  Type *ScalarTy = VL[0]->getType();
1571  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1572  ScalarTy = SI->getValueOperand()->getType();
1573  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1574 
1575  // If we have computed a smaller type for the expression, update VecTy so
1576  // that the costs will be accurate.
1577  if (MinBWs.count(VL[0]))
1578  VecTy = VectorType::get(
1579  IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
1580 
1581  if (E->NeedToGather) {
1582  if (allConstant(VL))
1583  return 0;
1584  if (isSplat(VL)) {
1585  return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
1586  }
1587  return getGatherCost(E->Scalars);
1588  }
1589  unsigned Opcode = getSameOpcode(VL);
1590  assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
1591  Instruction *VL0 = cast<Instruction>(VL[0]);
1592  switch (Opcode) {
1593  case Instruction::PHI: {
1594  return 0;
1595  }
1596  case Instruction::ExtractValue:
1597  case Instruction::ExtractElement: {
1598  if (canReuseExtract(VL, Opcode)) {
1599  int DeadCost = 0;
1600  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1601  Instruction *E = cast<Instruction>(VL[i]);
1602  if (E->hasOneUse())
1603  // Take credit for instruction that will become dead.
1604  DeadCost +=
1605  TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
1606  }
1607  return -DeadCost;
1608  }
1609  return getGatherCost(VecTy);
1610  }
1611  case Instruction::ZExt:
1612  case Instruction::SExt:
1613  case Instruction::FPToUI:
1614  case Instruction::FPToSI:
1615  case Instruction::FPExt:
1616  case Instruction::PtrToInt:
1617  case Instruction::IntToPtr:
1618  case Instruction::SIToFP:
1619  case Instruction::UIToFP:
1620  case Instruction::Trunc:
1621  case Instruction::FPTrunc:
1622  case Instruction::BitCast: {
1623  Type *SrcTy = VL0->getOperand(0)->getType();
1624 
1625  // Calculate the cost of this instruction.
1626  int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
1627  VL0->getType(), SrcTy);
1628 
1629  VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
1630  int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
1631  return VecCost - ScalarCost;
1632  }
1633  case Instruction::FCmp:
1634  case Instruction::ICmp:
1635  case Instruction::Select: {
1636  // Calculate the cost of this instruction.
1637  VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
1638  int ScalarCost = VecTy->getNumElements() *
1639  TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
1640  int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
1641  return VecCost - ScalarCost;
1642  }
1643  case Instruction::Add:
1644  case Instruction::FAdd:
1645  case Instruction::Sub:
1646  case Instruction::FSub:
1647  case Instruction::Mul:
1648  case Instruction::FMul:
1649  case Instruction::UDiv:
1650  case Instruction::SDiv:
1651  case Instruction::FDiv:
1652  case Instruction::URem:
1653  case Instruction::SRem:
1654  case Instruction::FRem:
1655  case Instruction::Shl:
1656  case Instruction::LShr:
1657  case Instruction::AShr:
1658  case Instruction::And:
1659  case Instruction::Or:
1660  case Instruction::Xor: {
1661  // Certain instructions can be cheaper to vectorize if they have a
1662  // constant second vector operand.
1671 
1672  // If all operands are exactly the same ConstantInt then set the
1673  // operand kind to OK_UniformConstantValue.
1674  // If instead not all operands are constants, then set the operand kind
1675  // to OK_AnyValue. If all operands are constants but not the same,
1676  // then set the operand kind to OK_NonUniformConstantValue.
1677  ConstantInt *CInt = nullptr;
1678  for (unsigned i = 0; i < VL.size(); ++i) {
1679  const Instruction *I = cast<Instruction>(VL[i]);
1680  if (!isa<ConstantInt>(I->getOperand(1))) {
1682  break;
1683  }
1684  if (i == 0) {
1685  CInt = cast<ConstantInt>(I->getOperand(1));
1686  continue;
1687  }
1689  CInt != cast<ConstantInt>(I->getOperand(1)))
1691  }
1692  // FIXME: Currently cost of model modification for division by power of
1693  // 2 is handled for X86 and AArch64. Add support for other targets.
1694  if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
1695  CInt->getValue().isPowerOf2())
1697 
1698  int ScalarCost = VecTy->getNumElements() *
1699  TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
1700  Op2VK, Op1VP, Op2VP);
1701  int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
1702  Op1VP, Op2VP);
1703  return VecCost - ScalarCost;
1704  }
1705  case Instruction::GetElementPtr: {
1710 
1711  int ScalarCost =
1712  VecTy->getNumElements() *
1713  TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
1714  int VecCost =
1715  TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
1716 
1717  return VecCost - ScalarCost;
1718  }
1719  case Instruction::Load: {
1720  // Cost of wide load - cost of scalar loads.
1721  unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
1722  int ScalarLdCost = VecTy->getNumElements() *
1723  TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
1724  int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
1725  VecTy, alignment, 0);
1726  return VecLdCost - ScalarLdCost;
1727  }
1728  case Instruction::Store: {
1729  // We know that we can merge the stores. Calculate the cost.
1730  unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
1731  int ScalarStCost = VecTy->getNumElements() *
1732  TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
1733  int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
1734  VecTy, alignment, 0);
1735  return VecStCost - ScalarStCost;
1736  }
1737  case Instruction::Call: {
1738  CallInst *CI = cast<CallInst>(VL0);
1740 
1741  // Calculate the cost of the scalar and vector calls.
1742  SmallVector<Type*, 4> ScalarTys, VecTys;
1743  for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
1744  ScalarTys.push_back(CI->getArgOperand(op)->getType());
1746  VecTy->getNumElements()));
1747  }
1748 
1749  FastMathFlags FMF;
1750  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
1751  FMF = FPMO->getFastMathFlags();
1752 
1753  int ScalarCallCost = VecTy->getNumElements() *
1754  TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
1755 
1756  int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
1757 
1758  DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
1759  << " (" << VecCallCost << "-" << ScalarCallCost << ")"
1760  << " for " << *CI << "\n");
1761 
1762  return VecCallCost - ScalarCallCost;
1763  }
1764  case Instruction::ShuffleVector: {
1769  int ScalarCost = 0;
1770  int VecCost = 0;
1771  for (Value *i : VL) {
1772  Instruction *I = cast<Instruction>(i);
1773  if (!I)
1774  break;
1775  ScalarCost +=
1776  TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
1777  }
1778  // VecCost is equal to sum of the cost of creating 2 vectors
1779  // and the cost of creating shuffle.
1780  Instruction *I0 = cast<Instruction>(VL[0]);
1781  VecCost =
1782  TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
1783  Instruction *I1 = cast<Instruction>(VL[1]);
1784  VecCost +=
1785  TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
1786  VecCost +=
1788  return VecCost - ScalarCost;
1789  }
1790  default:
1791  llvm_unreachable("Unknown instruction");
1792  }
1793 }
1794 
1795 bool BoUpSLP::isFullyVectorizableTinyTree() {
1796  DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
1797  VectorizableTree.size() << " is fully vectorizable .\n");
1798 
1799  // We only handle trees of heights 1 and 2.
1800  if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
1801  return true;
1802 
1803  if (VectorizableTree.size() != 2)
1804  return false;
1805 
1806  // Handle splat and all-constants stores.
1807  if (!VectorizableTree[0].NeedToGather &&
1808  (allConstant(VectorizableTree[1].Scalars) ||
1809  isSplat(VectorizableTree[1].Scalars)))
1810  return true;
1811 
1812  // Gathering cost would be too much for tiny trees.
1813  if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
1814  return false;
1815 
1816  return true;
1817 }
1818 
1820 
1821  // We can vectorize the tree if its size is greater than or equal to the
1822  // minimum size specified by the MinTreeSize command line option.
1823  if (VectorizableTree.size() >= MinTreeSize)
1824  return false;
1825 
1826  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
1827  // can vectorize it if we can prove it fully vectorizable.
1828  if (isFullyVectorizableTinyTree())
1829  return false;
1830 
1831  assert(VectorizableTree.empty()
1832  ? ExternalUses.empty()
1833  : true && "We shouldn't have any external users");
1834 
1835  // Otherwise, we can't vectorize the tree. It is both tiny and not fully
1836  // vectorizable.
1837  return true;
1838 }
1839 
1841  // Walk from the bottom of the tree to the top, tracking which values are
1842  // live. When we see a call instruction that is not part of our tree,
1843  // query TTI to see if there is a cost to keeping values live over it
1844  // (for example, if spills and fills are required).
1845  unsigned BundleWidth = VectorizableTree.front().Scalars.size();
1846  int Cost = 0;
1847 
1848  SmallPtrSet<Instruction*, 4> LiveValues;
1849  Instruction *PrevInst = nullptr;
1850 
1851  for (const auto &N : VectorizableTree) {
1852  Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
1853  if (!Inst)
1854  continue;
1855 
1856  if (!PrevInst) {
1857  PrevInst = Inst;
1858  continue;
1859  }
1860 
1861  // Update LiveValues.
1862  LiveValues.erase(PrevInst);
1863  for (auto &J : PrevInst->operands()) {
1864  if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
1865  LiveValues.insert(cast<Instruction>(&*J));
1866  }
1867 
1868  DEBUG(
1869  dbgs() << "SLP: #LV: " << LiveValues.size();
1870  for (auto *X : LiveValues)
1871  dbgs() << " " << X->getName();
1872  dbgs() << ", Looking at ";
1873  Inst->dump();
1874  );
1875 
1876  // Now find the sequence of instructions between PrevInst and Inst.
1878  PrevInstIt =
1879  PrevInst->getIterator().getReverse();
1880  while (InstIt != PrevInstIt) {
1881  if (PrevInstIt == PrevInst->getParent()->rend()) {
1882  PrevInstIt = Inst->getParent()->rbegin();
1883  continue;
1884  }
1885 
1886  if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
1888  for (auto *II : LiveValues)
1889  V.push_back(VectorType::get(II->getType(), BundleWidth));
1890  Cost += TTI->getCostOfKeepingLiveOverCall(V);
1891  }
1892 
1893  ++PrevInstIt;
1894  }
1895 
1896  PrevInst = Inst;
1897  }
1898 
1899  return Cost;
1900 }
1901 
1903  int Cost = 0;
1904  DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
1905  VectorizableTree.size() << ".\n");
1906 
1907  unsigned BundleWidth = VectorizableTree[0].Scalars.size();
1908 
1909  for (TreeEntry &TE : VectorizableTree) {
1910  int C = getEntryCost(&TE);
1911  DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
1912  << *TE.Scalars[0] << ".\n");
1913  Cost += C;
1914  }
1915 
1916  SmallSet<Value *, 16> ExtractCostCalculated;
1917  int ExtractCost = 0;
1918  for (ExternalUser &EU : ExternalUses) {
1919  // We only add extract cost once for the same scalar.
1920  if (!ExtractCostCalculated.insert(EU.Scalar).second)
1921  continue;
1922 
1923  // Uses by ephemeral values are free (because the ephemeral value will be
1924  // removed prior to code generation, and so the extraction will be
1925  // removed as well).
1926  if (EphValues.count(EU.User))
1927  continue;
1928 
1929  // If we plan to rewrite the tree in a smaller type, we will need to sign
1930  // extend the extracted value back to the original type. Here, we account
1931  // for the extract and the added cost of the sign extend if needed.
1932  auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
1933  auto *ScalarRoot = VectorizableTree[0].Scalars[0];
1934  if (MinBWs.count(ScalarRoot)) {
1935  auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
1936  auto Extend =
1937  MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
1938  VecTy = VectorType::get(MinTy, BundleWidth);
1939  ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
1940  VecTy, EU.Lane);
1941  } else {
1942  ExtractCost +=
1943  TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
1944  }
1945  }
1946 
1947  int SpillCost = getSpillCost();
1948  Cost += SpillCost + ExtractCost;
1949 
1950  DEBUG(dbgs() << "SLP: Spill Cost = " << SpillCost << ".\n"
1951  << "SLP: Extract Cost = " << ExtractCost << ".\n"
1952  << "SLP: Total Cost = " << Cost << ".\n");
1953  return Cost;
1954 }
1955 
1956 int BoUpSLP::getGatherCost(Type *Ty) {
1957  int Cost = 0;
1958  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
1959  Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
1960  return Cost;
1961 }
1962 
1963 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
1964  // Find the type of the operands in VL.
1965  Type *ScalarTy = VL[0]->getType();
1966  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1967  ScalarTy = SI->getValueOperand()->getType();
1968  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1969  // Find the cost of inserting/extracting values from the vector.
1970  return getGatherCost(VecTy);
1971 }
1972 
1973 // Reorder commutative operations in alternate shuffle if the resulting vectors
1974 // are consecutive loads. This would allow us to vectorize the tree.
1975 // If we have something like-
1976 // load a[0] - load b[0]
1977 // load b[1] + load a[1]
1978 // load a[2] - load b[2]
1979 // load a[3] + load b[3]
1980 // Reordering the second load b[1] load a[1] would allow us to vectorize this
1981 // code.
1982 void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
1984  SmallVectorImpl<Value *> &Right) {
1985  // Push left and right operands of binary operation into Left and Right
1986  for (Value *i : VL) {
1987  Left.push_back(cast<Instruction>(i)->getOperand(0));
1988  Right.push_back(cast<Instruction>(i)->getOperand(1));
1989  }
1990 
1991  // Reorder if we have a commutative operation and consecutive access
1992  // are on either side of the alternate instructions.
1993  for (unsigned j = 0; j < VL.size() - 1; ++j) {
1994  if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
1995  if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
1996  Instruction *VL1 = cast<Instruction>(VL[j]);
1997  Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1998  if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
1999  std::swap(Left[j], Right[j]);
2000  continue;
2001  } else if (VL2->isCommutative() &&
2002  isConsecutiveAccess(L, L1, *DL, *SE)) {
2003  std::swap(Left[j + 1], Right[j + 1]);
2004  continue;
2005  }
2006  // else unchanged
2007  }
2008  }
2009  if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
2010  if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
2011  Instruction *VL1 = cast<Instruction>(VL[j]);
2012  Instruction *VL2 = cast<Instruction>(VL[j + 1]);
2013  if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
2014  std::swap(Left[j], Right[j]);
2015  continue;
2016  } else if (VL2->isCommutative() &&
2017  isConsecutiveAccess(L, L1, *DL, *SE)) {
2018  std::swap(Left[j + 1], Right[j + 1]);
2019  continue;
2020  }
2021  // else unchanged
2022  }
2023  }
2024  }
2025 }
2026 
2027 // Return true if I should be commuted before adding it's left and right
2028 // operands to the arrays Left and Right.
2029 //
2030 // The vectorizer is trying to either have all elements one side being
2031 // instruction with the same opcode to enable further vectorization, or having
2032 // a splat to lower the vectorizing cost.
2033 static bool shouldReorderOperands(int i, Instruction &I,
2035  SmallVectorImpl<Value *> &Right,
2036  bool AllSameOpcodeLeft,
2037  bool AllSameOpcodeRight, bool SplatLeft,
2038  bool SplatRight) {
2039  Value *VLeft = I.getOperand(0);
2040  Value *VRight = I.getOperand(1);
2041  // If we have "SplatRight", try to see if commuting is needed to preserve it.
2042  if (SplatRight) {
2043  if (VRight == Right[i - 1])
2044  // Preserve SplatRight
2045  return false;
2046  if (VLeft == Right[i - 1]) {
2047  // Commuting would preserve SplatRight, but we don't want to break
2048  // SplatLeft either, i.e. preserve the original order if possible.
2049  // (FIXME: why do we care?)
2050  if (SplatLeft && VLeft == Left[i - 1])
2051  return false;
2052  return true;
2053  }
2054  }
2055  // Symmetrically handle Right side.
2056  if (SplatLeft) {
2057  if (VLeft == Left[i - 1])
2058  // Preserve SplatLeft
2059  return false;
2060  if (VRight == Left[i - 1])
2061  return true;
2062  }
2063 
2064  Instruction *ILeft = dyn_cast<Instruction>(VLeft);
2065  Instruction *IRight = dyn_cast<Instruction>(VRight);
2066 
2067  // If we have "AllSameOpcodeRight", try to see if the left operands preserves
2068  // it and not the right, in this case we want to commute.
2069  if (AllSameOpcodeRight) {
2070  unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
2071  if (IRight && RightPrevOpcode == IRight->getOpcode())
2072  // Do not commute, a match on the right preserves AllSameOpcodeRight
2073  return false;
2074  if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
2075  // We have a match and may want to commute, but first check if there is
2076  // not also a match on the existing operands on the Left to preserve
2077  // AllSameOpcodeLeft, i.e. preserve the original order if possible.
2078  // (FIXME: why do we care?)
2079  if (AllSameOpcodeLeft && ILeft &&
2080  cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
2081  return false;
2082  return true;
2083  }
2084  }
2085  // Symmetrically handle Left side.
2086  if (AllSameOpcodeLeft) {
2087  unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
2088  if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
2089  return false;
2090  if (IRight && LeftPrevOpcode == IRight->getOpcode())
2091  return true;
2092  }
2093  return false;
2094 }
2095 
2096 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2098  SmallVectorImpl<Value *> &Right) {
2099 
2100  if (VL.size()) {
2101  // Peel the first iteration out of the loop since there's nothing
2102  // interesting to do anyway and it simplifies the checks in the loop.
2103  auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
2104  auto VRight = cast<Instruction>(VL[0])->getOperand(1);
2105  if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
2106  // Favor having instruction to the right. FIXME: why?
2107  std::swap(VLeft, VRight);
2108  Left.push_back(VLeft);
2109  Right.push_back(VRight);
2110  }
2111 
2112  // Keep track if we have instructions with all the same opcode on one side.
2113  bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
2114  bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
2115  // Keep track if we have one side with all the same value (broadcast).
2116  bool SplatLeft = true;
2117  bool SplatRight = true;
2118 
2119  for (unsigned i = 1, e = VL.size(); i != e; ++i) {
2120  Instruction *I = cast<Instruction>(VL[i]);
2121  assert(I->isCommutative() && "Can only process commutative instruction");
2122  // Commute to favor either a splat or maximizing having the same opcodes on
2123  // one side.
2124  if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
2125  AllSameOpcodeRight, SplatLeft, SplatRight)) {
2126  Left.push_back(I->getOperand(1));
2127  Right.push_back(I->getOperand(0));
2128  } else {
2129  Left.push_back(I->getOperand(0));
2130  Right.push_back(I->getOperand(1));
2131  }
2132  // Update Splat* and AllSameOpcode* after the insertion.
2133  SplatRight = SplatRight && (Right[i - 1] == Right[i]);
2134  SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
2135  AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
2136  (cast<Instruction>(Left[i - 1])->getOpcode() ==
2137  cast<Instruction>(Left[i])->getOpcode());
2138  AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
2139  (cast<Instruction>(Right[i - 1])->getOpcode() ==
2140  cast<Instruction>(Right[i])->getOpcode());
2141  }
2142 
2143  // If one operand end up being broadcast, return this operand order.
2144  if (SplatRight || SplatLeft)
2145  return;
2146 
2147  // Finally check if we can get longer vectorizable chain by reordering
2148  // without breaking the good operand order detected above.
2149  // E.g. If we have something like-
2150  // load a[0] load b[0]
2151  // load b[1] load a[1]
2152  // load a[2] load b[2]
2153  // load a[3] load b[3]
2154  // Reordering the second load b[1] load a[1] would allow us to vectorize
2155  // this code and we still retain AllSameOpcode property.
2156  // FIXME: This load reordering might break AllSameOpcode in some rare cases
2157  // such as-
2158  // add a[0],c[0] load b[0]
2159  // add a[1],c[2] load b[1]
2160  // b[2] load b[2]
2161  // add a[3],c[3] load b[3]
2162  for (unsigned j = 0; j < VL.size() - 1; ++j) {
2163  if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
2164  if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
2165  if (isConsecutiveAccess(L, L1, *DL, *SE)) {
2166  std::swap(Left[j + 1], Right[j + 1]);
2167  continue;
2168  }
2169  }
2170  }
2171  if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
2172  if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
2173  if (isConsecutiveAccess(L, L1, *DL, *SE)) {
2174  std::swap(Left[j + 1], Right[j + 1]);
2175  continue;
2176  }
2177  }
2178  }
2179  // else unchanged
2180  }
2181 }
2182 
2183 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
2184 
2185  // Get the basic block this bundle is in. All instructions in the bundle
2186  // should be in this block.
2187  auto *Front = cast<Instruction>(VL.front());
2188  auto *BB = Front->getParent();
2189  assert(all_of(make_range(VL.begin(), VL.end()), [&](Value *V) -> bool {
2190  return cast<Instruction>(V)->getParent() == BB;
2191  }));
2192 
2193  // The last instruction in the bundle in program order.
2194  Instruction *LastInst = nullptr;
2195 
2196  // Find the last instruction. The common case should be that BB has been
2197  // scheduled, and the last instruction is VL.back(). So we start with
2198  // VL.back() and iterate over schedule data until we reach the end of the
2199  // bundle. The end of the bundle is marked by null ScheduleData.
2200  if (BlocksSchedules.count(BB)) {
2201  auto *Bundle = BlocksSchedules[BB]->getScheduleData(VL.back());
2202  if (Bundle && Bundle->isPartOfBundle())
2203  for (; Bundle; Bundle = Bundle->NextInBundle)
2204  LastInst = Bundle->Inst;
2205  }
2206 
2207  // LastInst can still be null at this point if there's either not an entry
2208  // for BB in BlocksSchedules or there's no ScheduleData available for
2209  // VL.back(). This can be the case if buildTree_rec aborts for various
2210  // reasons (e.g., the maximum recursion depth is reached, the maximum region
2211  // size is reached, etc.). ScheduleData is initialized in the scheduling
2212  // "dry-run".
2213  //
2214  // If this happens, we can still find the last instruction by brute force. We
2215  // iterate forwards from Front (inclusive) until we either see all
2216  // instructions in the bundle or reach the end of the block. If Front is the
2217  // last instruction in program order, LastInst will be set to Front, and we
2218  // will visit all the remaining instructions in the block.
2219  //
2220  // One of the reasons we exit early from buildTree_rec is to place an upper
2221  // bound on compile-time. Thus, taking an additional compile-time hit here is
2222  // not ideal. However, this should be exceedingly rare since it requires that
2223  // we both exit early from buildTree_rec and that the bundle be out-of-order
2224  // (causing us to iterate all the way to the end of the block).
2225  if (!LastInst) {
2226  SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
2227  for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
2228  if (Bundle.erase(&I))
2229  LastInst = &I;
2230  if (Bundle.empty())
2231  break;
2232  }
2233  }
2234 
2235  // Set the insertion point after the last instruction in the bundle. Set the
2236  // debug location to Front.
2237  Builder.SetInsertPoint(BB, ++LastInst->getIterator());
2238  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
2239 }
2240 
2241 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
2242  Value *Vec = UndefValue::get(Ty);
2243  // Generate the 'InsertElement' instruction.
2244  for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
2245  Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
2246  if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
2247  GatherSeq.insert(Insrt);
2248  CSEBlocks.insert(Insrt->getParent());
2249 
2250  // Add to our 'need-to-extract' list.
2251  if (ScalarToTreeEntry.count(VL[i])) {
2252  int Idx = ScalarToTreeEntry[VL[i]];
2253  TreeEntry *E = &VectorizableTree[Idx];
2254  // Find which lane we need to extract.
2255  int FoundLane = -1;
2256  for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
2257  // Is this the lane of the scalar that we are looking for ?
2258  if (E->Scalars[Lane] == VL[i]) {
2259  FoundLane = Lane;
2260  break;
2261  }
2262  }
2263  assert(FoundLane >= 0 && "Could not find the correct lane");
2264  ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
2265  }
2266  }
2267  }
2268 
2269  return Vec;
2270 }
2271 
2272 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
2274  = ScalarToTreeEntry.find(VL[0]);
2275  if (Entry != ScalarToTreeEntry.end()) {
2276  int Idx = Entry->second;
2277  const TreeEntry *En = &VectorizableTree[Idx];
2278  if (En->isSame(VL) && En->VectorizedValue)
2279  return En->VectorizedValue;
2280  }
2281  return nullptr;
2282 }
2283 
2285  if (ScalarToTreeEntry.count(VL[0])) {
2286  int Idx = ScalarToTreeEntry[VL[0]];
2287  TreeEntry *E = &VectorizableTree[Idx];
2288  if (E->isSame(VL))
2289  return vectorizeTree(E);
2290  }
2291 
2292  Type *ScalarTy = VL[0]->getType();
2293  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
2294  ScalarTy = SI->getValueOperand()->getType();
2295  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
2296 
2297  return Gather(VL, VecTy);
2298 }
2299 
2300 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
2301  IRBuilder<>::InsertPointGuard Guard(Builder);
2302 
2303  if (E->VectorizedValue) {
2304  DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
2305  return E->VectorizedValue;
2306  }
2307 
2308  Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
2309  Type *ScalarTy = VL0->getType();
2310  if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
2311  ScalarTy = SI->getValueOperand()->getType();
2312  VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
2313 
2314  if (E->NeedToGather) {
2315  setInsertPointAfterBundle(E->Scalars);
2316  auto *V = Gather(E->Scalars, VecTy);
2317  E->VectorizedValue = V;
2318  return V;
2319  }
2320 
2321  unsigned Opcode = getSameOpcode(E->Scalars);
2322 
2323  switch (Opcode) {
2324  case Instruction::PHI: {
2325  PHINode *PH = dyn_cast<PHINode>(VL0);
2326  Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
2327  Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2328  PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
2329  E->VectorizedValue = NewPhi;
2330 
2331  // PHINodes may have multiple entries from the same block. We want to
2332  // visit every block once.
2333  SmallSet<BasicBlock*, 4> VisitedBBs;
2334 
2335  for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
2336  ValueList Operands;
2337  BasicBlock *IBB = PH->getIncomingBlock(i);
2338 
2339  if (!VisitedBBs.insert(IBB).second) {
2340  NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
2341  continue;
2342  }
2343 
2344  // Prepare the operand vector.
2345  for (Value *V : E->Scalars)
2346  Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
2347 
2348  Builder.SetInsertPoint(IBB->getTerminator());
2349  Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2350  Value *Vec = vectorizeTree(Operands);
2351  NewPhi->addIncoming(Vec, IBB);
2352  }
2353 
2354  assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
2355  "Invalid number of incoming values");
2356  return NewPhi;
2357  }
2358 
2359  case Instruction::ExtractElement: {
2360  if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) {
2361  Value *V = VL0->getOperand(0);
2362  E->VectorizedValue = V;
2363  return V;
2364  }
2365  setInsertPointAfterBundle(E->Scalars);
2366  auto *V = Gather(E->Scalars, VecTy);
2367  E->VectorizedValue = V;
2368  return V;
2369  }
2370  case Instruction::ExtractValue: {
2371  if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) {
2372  LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
2373  Builder.SetInsertPoint(LI);
2374  PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
2375  Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
2376  LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
2377  E->VectorizedValue = V;
2378  return propagateMetadata(V, E->Scalars);
2379  }
2380  setInsertPointAfterBundle(E->Scalars);
2381  auto *V = Gather(E->Scalars, VecTy);
2382  E->VectorizedValue = V;
2383  return V;
2384  }
2385  case Instruction::ZExt:
2386  case Instruction::SExt:
2387  case Instruction::FPToUI:
2388  case Instruction::FPToSI:
2389  case Instruction::FPExt:
2390  case Instruction::PtrToInt:
2391  case Instruction::IntToPtr:
2392  case Instruction::SIToFP:
2393  case Instruction::UIToFP:
2394  case Instruction::Trunc:
2395  case Instruction::FPTrunc:
2396  case Instruction::BitCast: {
2397  ValueList INVL;
2398  for (Value *V : E->Scalars)
2399  INVL.push_back(cast<Instruction>(V)->getOperand(0));
2400 
2401  setInsertPointAfterBundle(E->Scalars);
2402 
2403  Value *InVec = vectorizeTree(INVL);
2404 
2405  if (Value *V = alreadyVectorized(E->Scalars))
2406  return V;
2407 
2408  CastInst *CI = dyn_cast<CastInst>(VL0);
2409  Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
2410  E->VectorizedValue = V;
2411  ++NumVectorInstructions;
2412  return V;
2413  }
2414  case Instruction::FCmp:
2415  case Instruction::ICmp: {
2416  ValueList LHSV, RHSV;
2417  for (Value *V : E->Scalars) {
2418  LHSV.push_back(cast<Instruction>(V)->getOperand(0));
2419  RHSV.push_back(cast<Instruction>(V)->getOperand(1));
2420  }
2421 
2422  setInsertPointAfterBundle(E->Scalars);
2423 
2424  Value *L = vectorizeTree(LHSV);
2425  Value *R = vectorizeTree(RHSV);
2426 
2427  if (Value *V = alreadyVectorized(E->Scalars))
2428  return V;
2429 
2430  CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
2431  Value *V;
2432  if (Opcode == Instruction::FCmp)
2433  V = Builder.CreateFCmp(P0, L, R);
2434  else
2435  V = Builder.CreateICmp(P0, L, R);
2436 
2437  E->VectorizedValue = V;
2438  propagateIRFlags(E->VectorizedValue, E->Scalars);
2439  ++NumVectorInstructions;
2440  return V;
2441  }
2442  case Instruction::Select: {
2443  ValueList TrueVec, FalseVec, CondVec;
2444  for (Value *V : E->Scalars) {
2445  CondVec.push_back(cast<Instruction>(V)->getOperand(0));
2446  TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
2447  FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
2448  }
2449 
2450  setInsertPointAfterBundle(E->Scalars);
2451 
2452  Value *Cond = vectorizeTree(CondVec);
2453  Value *True = vectorizeTree(TrueVec);
2454  Value *False = vectorizeTree(FalseVec);
2455 
2456  if (Value *V = alreadyVectorized(E->Scalars))
2457  return V;
2458 
2459  Value *V = Builder.CreateSelect(Cond, True, False);
2460  E->VectorizedValue = V;
2461  ++NumVectorInstructions;
2462  return V;
2463  }
2464  case Instruction::Add:
2465  case Instruction::FAdd:
2466  case Instruction::Sub:
2467  case Instruction::FSub:
2468  case Instruction::Mul:
2469  case Instruction::FMul:
2470  case Instruction::UDiv:
2471  case Instruction::SDiv:
2472  case Instruction::FDiv:
2473  case Instruction::URem:
2474  case Instruction::SRem:
2475  case Instruction::FRem:
2476  case Instruction::Shl:
2477  case Instruction::LShr:
2478  case Instruction::AShr:
2479  case Instruction::And:
2480  case Instruction::Or:
2481  case Instruction::Xor: {
2482  ValueList LHSVL, RHSVL;
2483  if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
2484  reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
2485  else
2486  for (Value *V : E->Scalars) {
2487  LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
2488  RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
2489  }
2490 
2491  setInsertPointAfterBundle(E->Scalars);
2492 
2493  Value *LHS = vectorizeTree(LHSVL);
2494  Value *RHS = vectorizeTree(RHSVL);
2495 
2496  if (Value *V = alreadyVectorized(E->Scalars))
2497  return V;
2498 
2499  BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
2500  Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
2501  E->VectorizedValue = V;
2502  propagateIRFlags(E->VectorizedValue, E->Scalars);
2503  ++NumVectorInstructions;
2504 
2505  if (Instruction *I = dyn_cast<Instruction>(V))
2506  return propagateMetadata(I, E->Scalars);
2507 
2508  return V;
2509  }
2510  case Instruction::Load: {
2511  // Loads are inserted at the head of the tree because we don't want to
2512  // sink them all the way down past store instructions.
2513  setInsertPointAfterBundle(E->Scalars);
2514 
2515  LoadInst *LI = cast<LoadInst>(VL0);
2516  Type *ScalarLoadTy = LI->getType();
2517  unsigned AS = LI->getPointerAddressSpace();
2518 
2519  Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
2520  VecTy->getPointerTo(AS));
2521 
2522  // The pointer operand uses an in-tree scalar so we add the new BitCast to
2523  // ExternalUses list to make sure that an extract will be generated in the
2524  // future.
2525  if (ScalarToTreeEntry.count(LI->getPointerOperand()))
2526  ExternalUses.push_back(
2527  ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
2528 
2529  unsigned Alignment = LI->getAlignment();
2530  LI = Builder.CreateLoad(VecPtr);
2531  if (!Alignment) {
2532  Alignment = DL->getABITypeAlignment(ScalarLoadTy);
2533  }
2534  LI->setAlignment(Alignment);
2535  E->VectorizedValue = LI;
2536  ++NumVectorInstructions;
2537  return propagateMetadata(LI, E->Scalars);
2538  }
2539  case Instruction::Store: {
2540  StoreInst *SI = cast<StoreInst>(VL0);
2541  unsigned Alignment = SI->getAlignment();
2542  unsigned AS = SI->getPointerAddressSpace();
2543 
2544  ValueList ValueOp;
2545  for (Value *V : E->Scalars)
2546  ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
2547 
2548  setInsertPointAfterBundle(E->Scalars);
2549 
2550  Value *VecValue = vectorizeTree(ValueOp);
2551  Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
2552  VecTy->getPointerTo(AS));
2553  StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
2554 
2555  // The pointer operand uses an in-tree scalar so we add the new BitCast to
2556  // ExternalUses list to make sure that an extract will be generated in the
2557  // future.
2558  if (ScalarToTreeEntry.count(SI->getPointerOperand()))
2559  ExternalUses.push_back(
2560  ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
2561 
2562  if (!Alignment) {
2563  Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
2564  }
2565  S->setAlignment(Alignment);
2566  E->VectorizedValue = S;
2567  ++NumVectorInstructions;
2568  return propagateMetadata(S, E->Scalars);
2569  }
2570  case Instruction::GetElementPtr: {
2571  setInsertPointAfterBundle(E->Scalars);
2572 
2573  ValueList Op0VL;
2574  for (Value *V : E->Scalars)
2575  Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
2576 
2577  Value *Op0 = vectorizeTree(Op0VL);
2578 
2579  std::vector<Value *> OpVecs;
2580  for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
2581  ++j) {
2582  ValueList OpVL;
2583  for (Value *V : E->Scalars)
2584  OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
2585 
2586  Value *OpVec = vectorizeTree(OpVL);
2587  OpVecs.push_back(OpVec);
2588  }
2589 
2590  Value *V = Builder.CreateGEP(
2591  cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
2592  E->VectorizedValue = V;
2593  ++NumVectorInstructions;
2594 
2595  if (Instruction *I = dyn_cast<Instruction>(V))
2596  return propagateMetadata(I, E->Scalars);
2597 
2598  return V;
2599  }
2600  case Instruction::Call: {
2601  CallInst *CI = cast<CallInst>(VL0);
2602  setInsertPointAfterBundle(E->Scalars);
2603  Function *FI;
2605  Value *ScalarArg = nullptr;
2606  if (CI && (FI = CI->getCalledFunction())) {
2607  IID = FI->getIntrinsicID();
2608  }
2609  std::vector<Value *> OpVecs;
2610  for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
2611  ValueList OpVL;
2612  // ctlz,cttz and powi are special intrinsics whose second argument is
2613  // a scalar. This argument should not be vectorized.
2614  if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
2615  CallInst *CEI = cast<CallInst>(E->Scalars[0]);
2616  ScalarArg = CEI->getArgOperand(j);
2617  OpVecs.push_back(CEI->getArgOperand(j));
2618  continue;
2619  }
2620  for (Value *V : E->Scalars) {
2621  CallInst *CEI = cast<CallInst>(V);
2622  OpVL.push_back(CEI->getArgOperand(j));
2623  }
2624 
2625  Value *OpVec = vectorizeTree(OpVL);
2626  DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
2627  OpVecs.push_back(OpVec);
2628  }
2629 
2630  Module *M = F->getParent();
2632  Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
2633  Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
2635  CI->getOperandBundlesAsDefs(OpBundles);
2636  Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
2637 
2638  // The scalar argument uses an in-tree scalar so we add the new vectorized
2639  // call to ExternalUses list to make sure that an extract will be
2640  // generated in the future.
2641  if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
2642  ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
2643 
2644  E->VectorizedValue = V;
2645  propagateIRFlags(E->VectorizedValue, E->Scalars);
2646  ++NumVectorInstructions;
2647  return V;
2648  }
2649  case Instruction::ShuffleVector: {
2650  ValueList LHSVL, RHSVL;
2651  assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
2652  reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
2653  setInsertPointAfterBundle(E->Scalars);
2654 
2655  Value *LHS = vectorizeTree(LHSVL);
2656  Value *RHS = vectorizeTree(RHSVL);
2657 
2658  if (Value *V = alreadyVectorized(E->Scalars))
2659  return V;
2660 
2661  // Create a vector of LHS op1 RHS
2662  BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
2663  Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
2664 
2665  // Create a vector of LHS op2 RHS
2666  Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
2667  BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
2668  Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
2669 
2670  // Create shuffle to take alternate operations from the vector.
2671  // Also, gather up odd and even scalar ops to propagate IR flags to
2672  // each vector operation.
2673  ValueList OddScalars, EvenScalars;
2674  unsigned e = E->Scalars.size();
2676  for (unsigned i = 0; i < e; ++i) {
2677  if (i & 1) {
2678  Mask[i] = Builder.getInt32(e + i);
2679  OddScalars.push_back(E->Scalars[i]);
2680  } else {
2681  Mask[i] = Builder.getInt32(i);
2682  EvenScalars.push_back(E->Scalars[i]);
2683  }
2684  }
2685 
2686  Value *ShuffleMask = ConstantVector::get(Mask);
2687  propagateIRFlags(V0, EvenScalars);
2688  propagateIRFlags(V1, OddScalars);
2689 
2690  Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2691  E->VectorizedValue = V;
2692  ++NumVectorInstructions;
2693  if (Instruction *I = dyn_cast<Instruction>(V))
2694  return propagateMetadata(I, E->Scalars);
2695 
2696  return V;
2697  }
2698  default:
2699  llvm_unreachable("unknown inst");
2700  }
2701  return nullptr;
2702 }
2703 
2705 
2706  // All blocks must be scheduled before any instructions are inserted.
2707  for (auto &BSIter : BlocksSchedules) {
2708  scheduleBlock(BSIter.second.get());
2709  }
2710 
2711  Builder.SetInsertPoint(&F->getEntryBlock().front());
2712  auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
2713 
2714  // If the vectorized tree can be rewritten in a smaller type, we truncate the
2715  // vectorized root. InstCombine will then rewrite the entire expression. We
2716  // sign extend the extracted values below.
2717  auto *ScalarRoot = VectorizableTree[0].Scalars[0];
2718  if (MinBWs.count(ScalarRoot)) {
2719  if (auto *I = dyn_cast<Instruction>(VectorRoot))
2720  Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
2721  auto BundleWidth = VectorizableTree[0].Scalars.size();
2722  auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
2723  auto *VecTy = VectorType::get(MinTy, BundleWidth);
2724  auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
2725  VectorizableTree[0].VectorizedValue = Trunc;
2726  }
2727 
2728  DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
2729 
2730  // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
2731  // specified by ScalarType.
2732  auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
2733  if (!MinBWs.count(ScalarRoot))
2734  return Ex;
2735  if (MinBWs[ScalarRoot].second)
2736  return Builder.CreateSExt(Ex, ScalarType);
2737  return Builder.CreateZExt(Ex, ScalarType);
2738  };
2739 
2740  // Extract all of the elements with the external uses.
2741  for (const auto &ExternalUse : ExternalUses) {
2742  Value *Scalar = ExternalUse.Scalar;
2743  llvm::User *User = ExternalUse.User;
2744 
2745  // Skip users that we already RAUW. This happens when one instruction
2746  // has multiple uses of the same value.
2747  if (!is_contained(Scalar->users(), User))
2748  continue;
2749  assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
2750 
2751  int Idx = ScalarToTreeEntry[Scalar];
2752  TreeEntry *E = &VectorizableTree[Idx];
2753  assert(!E->NeedToGather && "Extracting from a gather list");
2754 
2755  Value *Vec = E->VectorizedValue;
2756  assert(Vec && "Can't find vectorizable value");
2757 
2758  Value *Lane = Builder.getInt32(ExternalUse.Lane);
2759  // Generate extracts for out-of-tree users.
2760  // Find the insertion point for the extractelement lane.
2761  if (auto *VecI = dyn_cast<Instruction>(Vec)) {
2762  if (PHINode *PH = dyn_cast<PHINode>(User)) {
2763  for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
2764  if (PH->getIncomingValue(i) == Scalar) {
2765  TerminatorInst *IncomingTerminator =
2766  PH->getIncomingBlock(i)->getTerminator();
2767  if (isa<CatchSwitchInst>(IncomingTerminator)) {
2768  Builder.SetInsertPoint(VecI->getParent(),
2769  std::next(VecI->getIterator()));
2770  } else {
2771  Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
2772  }
2773  Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2774  Ex = extend(ScalarRoot, Ex, Scalar->getType());
2775  CSEBlocks.insert(PH->getIncomingBlock(i));
2776  PH->setOperand(i, Ex);
2777  }
2778  }
2779  } else {
2780  Builder.SetInsertPoint(cast<Instruction>(User));
2781  Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2782  Ex = extend(ScalarRoot, Ex, Scalar->getType());
2783  CSEBlocks.insert(cast<Instruction>(User)->getParent());
2784  User->replaceUsesOfWith(Scalar, Ex);
2785  }
2786  } else {
2787  Builder.SetInsertPoint(&F->getEntryBlock().front());
2788  Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2789  Ex = extend(ScalarRoot, Ex, Scalar->getType());
2790  CSEBlocks.insert(&F->getEntryBlock());
2791  User->replaceUsesOfWith(Scalar, Ex);
2792  }
2793 
2794  DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
2795  }
2796 
2797  // For each vectorized value:
2798  for (TreeEntry &EIdx : VectorizableTree) {
2799  TreeEntry *Entry = &EIdx;
2800 
2801  // For each lane:
2802  for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
2803  Value *Scalar = Entry->Scalars[Lane];
2804  // No need to handle users of gathered values.
2805  if (Entry->NeedToGather)
2806  continue;
2807 
2808  assert(Entry->VectorizedValue && "Can't find vectorizable value");
2809 
2810  Type *Ty = Scalar->getType();
2811  if (!Ty->isVoidTy()) {
2812 #ifndef NDEBUG
2813  for (User *U : Scalar->users()) {
2814  DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
2815 
2816  assert((ScalarToTreeEntry.count(U) ||
2817  // It is legal to replace users in the ignorelist by undef.
2818  is_contained(UserIgnoreList, U)) &&
2819  "Replacing out-of-tree value with undef");
2820  }
2821 #endif
2822  Value *Undef = UndefValue::get(Ty);
2823  Scalar->replaceAllUsesWith(Undef);
2824  }
2825  DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
2826  eraseInstruction(cast<Instruction>(Scalar));
2827  }
2828  }
2829 
2830  Builder.ClearInsertionPoint();
2831 
2832  return VectorizableTree[0].VectorizedValue;
2833 }
2834 
2836  DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
2837  << " gather sequences instructions.\n");
2838  // LICM InsertElementInst sequences.
2839  for (Instruction *it : GatherSeq) {
2841 
2842  if (!Insert)
2843  continue;
2844 
2845  // Check if this block is inside a loop.
2846  Loop *L = LI->getLoopFor(Insert->getParent());
2847  if (!L)
2848  continue;
2849 
2850  // Check if it has a preheader.
2851  BasicBlock *PreHeader = L->getLoopPreheader();
2852  if (!PreHeader)
2853  continue;
2854 
2855  // If the vector or the element that we insert into it are
2856  // instructions that are defined in this basic block then we can't
2857  // hoist this instruction.
2858  Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
2859  Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
2860  if (CurrVec && L->contains(CurrVec))
2861  continue;
2862  if (NewElem && L->contains(NewElem))
2863  continue;
2864 
2865  // We can hoist this instruction. Move it to the pre-header.
2866  Insert->moveBefore(PreHeader->getTerminator());
2867  }
2868 
2869  // Make a list of all reachable blocks in our CSE queue.
2871  CSEWorkList.reserve(CSEBlocks.size());
2872  for (BasicBlock *BB : CSEBlocks)
2873  if (DomTreeNode *N = DT->getNode(BB)) {
2875  CSEWorkList.push_back(N);
2876  }
2877 
2878  // Sort blocks by domination. This ensures we visit a block after all blocks
2879  // dominating it are visited.
2880  std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
2881  [this](const DomTreeNode *A, const DomTreeNode *B) {
2882  return DT->properlyDominates(A, B);
2883  });
2884 
2885  // Perform O(N^2) search over the gather sequences and merge identical
2886  // instructions. TODO: We can further optimize this scan if we split the
2887  // instructions into different buckets based on the insert lane.
2889  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
2890  assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
2891  "Worklist not sorted properly!");
2892  BasicBlock *BB = (*I)->getBlock();
2893  // For all instructions in blocks containing gather sequences:
2894  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
2895  Instruction *In = &*it++;
2896  if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
2897  continue;
2898 
2899  // Check if we can replace this instruction with any of the
2900  // visited instructions.
2901  for (Instruction *v : Visited) {
2902  if (In->isIdenticalTo(v) &&
2903  DT->dominates(v->getParent(), In->getParent())) {
2904  In->replaceAllUsesWith(v);
2905  eraseInstruction(In);
2906  In = nullptr;
2907  break;
2908  }
2909  }
2910  if (In) {
2911  assert(!is_contained(Visited, In));
2912  Visited.push_back(In);
2913  }
2914  }
2915  }
2916  CSEBlocks.clear();
2917  GatherSeq.clear();
2918 }
2919 
2920 // Groups the instructions to a bundle (which is then a single scheduling entity)
2921 // and schedules instructions until the bundle gets ready.
2922 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
2923  BoUpSLP *SLP) {
2924  if (isa<PHINode>(VL[0]))
2925  return true;
2926 
2927  // Initialize the instruction bundle.
2928  Instruction *OldScheduleEnd = ScheduleEnd;
2929  ScheduleData *PrevInBundle = nullptr;
2930  ScheduleData *Bundle = nullptr;
2931  bool ReSchedule = false;
2932  DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
2933 
2934  // Make sure that the scheduling region contains all
2935  // instructions of the bundle.
2936  for (Value *V : VL) {
2937  if (!extendSchedulingRegion(V))
2938  return false;
2939  }
2940 
2941  for (Value *V : VL) {
2942  ScheduleData *BundleMember = getScheduleData(V);
2943  assert(BundleMember &&
2944  "no ScheduleData for bundle member (maybe not in same basic block)");
2945  if (BundleMember->IsScheduled) {
2946  // A bundle member was scheduled as single instruction before and now
2947  // needs to be scheduled as part of the bundle. We just get rid of the
2948  // existing schedule.
2949  DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
2950  << " was already scheduled\n");
2951  ReSchedule = true;
2952  }
2953  assert(BundleMember->isSchedulingEntity() &&
2954  "bundle member already part of other bundle");
2955  if (PrevInBundle) {
2956  PrevInBundle->NextInBundle = BundleMember;
2957  } else {
2958  Bundle = BundleMember;
2959  }
2960  BundleMember->UnscheduledDepsInBundle = 0;
2961  Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
2962 
2963  // Group the instructions to a bundle.
2964  BundleMember->FirstInBundle = Bundle;
2965  PrevInBundle = BundleMember;
2966  }
2967  if (ScheduleEnd != OldScheduleEnd) {
2968  // The scheduling region got new instructions at the lower end (or it is a
2969  // new region for the first bundle). This makes it necessary to
2970  // recalculate all dependencies.
2971  // It is seldom that this needs to be done a second time after adding the
2972  // initial bundle to the region.
2973  for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2974  ScheduleData *SD = getScheduleData(I);
2975  SD->clearDependencies();
2976  }
2977  ReSchedule = true;
2978  }
2979  if (ReSchedule) {
2980  resetSchedule();
2981  initialFillReadyList(ReadyInsts);
2982  }
2983 
2984  DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
2985  << BB->getName() << "\n");
2986 
2987  calculateDependencies(Bundle, true, SLP);
2988 
2989  // Now try to schedule the new bundle. As soon as the bundle is "ready" it
2990  // means that there are no cyclic dependencies and we can schedule it.
2991  // Note that's important that we don't "schedule" the bundle yet (see
2992  // cancelScheduling).
2993  while (!Bundle->isReady() && !ReadyInsts.empty()) {
2994 
2995  ScheduleData *pickedSD = ReadyInsts.back();
2996  ReadyInsts.pop_back();
2997 
2998  if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
2999  schedule(pickedSD, ReadyInsts);
3000  }
3001  }
3002  if (!Bundle->isReady()) {
3003  cancelScheduling(VL);
3004  return false;
3005  }
3006  return true;
3007 }
3008 
3009 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
3010  if (isa<PHINode>(VL[0]))
3011  return;
3012 
3013  ScheduleData *Bundle = getScheduleData(VL[0]);
3014  DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
3015  assert(!Bundle->IsScheduled &&
3016  "Can't cancel bundle which is already scheduled");
3017  assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
3018  "tried to unbundle something which is not a bundle");
3019 
3020  // Un-bundle: make single instructions out of the bundle.
3021  ScheduleData *BundleMember = Bundle;
3022  while (BundleMember) {
3023  assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
3024  BundleMember->FirstInBundle = BundleMember;
3025  ScheduleData *Next = BundleMember->NextInBundle;
3026  BundleMember->NextInBundle = nullptr;
3027  BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
3028  if (BundleMember->UnscheduledDepsInBundle == 0) {
3029  ReadyInsts.insert(BundleMember);
3030  }
3031  BundleMember = Next;
3032  }
3033 }
3034 
3035 bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
3036  if (getScheduleData(V))
3037  return true;
3038  Instruction *I = dyn_cast<Instruction>(V);
3039  assert(I && "bundle member must be an instruction");
3040  assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
3041  if (!ScheduleStart) {
3042  // It's the first instruction in the new region.
3043  initScheduleData(I, I->getNextNode(), nullptr, nullptr);
3044  ScheduleStart = I;
3045  ScheduleEnd = I->getNextNode();
3046  assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
3047  DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
3048  return true;
3049  }
3050  // Search up and down at the same time, because we don't know if the new
3051  // instruction is above or below the existing scheduling region.
3053  ++ScheduleStart->getIterator().getReverse();
3054  BasicBlock::reverse_iterator UpperEnd = BB->rend();
3055  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
3056  BasicBlock::iterator LowerEnd = BB->end();
3057  for (;;) {
3058  if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
3059  DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
3060  return false;
3061  }
3062 
3063  if (UpIter != UpperEnd) {
3064  if (&*UpIter == I) {
3065  initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
3066  ScheduleStart = I;
3067  DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
3068  return true;
3069  }
3070  UpIter++;
3071  }
3072  if (DownIter != LowerEnd) {
3073  if (&*DownIter == I) {
3074  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
3075  nullptr);
3076  ScheduleEnd = I->getNextNode();
3077  assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
3078  DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
3079  return true;
3080  }
3081  DownIter++;
3082  }
3083  assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
3084  "instruction not found in block");
3085  }
3086  return true;
3087 }
3088 
3089 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
3090  Instruction *ToI,
3091  ScheduleData *PrevLoadStore,
3092  ScheduleData *NextLoadStore) {
3093  ScheduleData *CurrentLoadStore = PrevLoadStore;
3094  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
3095  ScheduleData *SD = ScheduleDataMap[I];
3096  if (!SD) {
3097  // Allocate a new ScheduleData for the instruction.
3098  if (ChunkPos >= ChunkSize) {
3099  ScheduleDataChunks.push_back(
3100  llvm::make_unique<ScheduleData[]>(ChunkSize));
3101  ChunkPos = 0;
3102  }
3103  SD = &(ScheduleDataChunks.back()[ChunkPos++]);
3104  ScheduleDataMap[I] = SD;
3105  SD->Inst = I;
3106  }
3107  assert(!isInSchedulingRegion(SD) &&
3108  "new ScheduleData already in scheduling region");
3109  SD->init(SchedulingRegionID);
3110 
3111  if (I->mayReadOrWriteMemory()) {
3112  // Update the linked list of memory accessing instructions.
3113  if (CurrentLoadStore) {
3114  CurrentLoadStore->NextLoadStore = SD;
3115  } else {
3116  FirstLoadStoreInRegion = SD;
3117  }
3118  CurrentLoadStore = SD;
3119  }
3120  }
3121  if (NextLoadStore) {
3122  if (CurrentLoadStore)
3123  CurrentLoadStore->NextLoadStore = NextLoadStore;
3124  } else {
3125  LastLoadStoreInRegion = CurrentLoadStore;
3126  }
3127 }
3128 
3129 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
3130  bool InsertInReadyList,
3131  BoUpSLP *SLP) {
3132  assert(SD->isSchedulingEntity());
3133 
3135  WorkList.push_back(SD);
3136 
3137  while (!WorkList.empty()) {
3138  ScheduleData *SD = WorkList.back();
3139  WorkList.pop_back();
3140 
3141  ScheduleData *BundleMember = SD;
3142  while (BundleMember) {
3143  assert(isInSchedulingRegion(BundleMember));
3144  if (!BundleMember->hasValidDependencies()) {
3145 
3146  DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
3147  BundleMember->Dependencies = 0;
3148  BundleMember->resetUnscheduledDeps();
3149 
3150  // Handle def-use chain dependencies.
3151  for (User *U : BundleMember->Inst->users()) {
3152  if (isa<Instruction>(U)) {
3153  ScheduleData *UseSD = getScheduleData(U);
3154  if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
3155  BundleMember->Dependencies++;
3156  ScheduleData *DestBundle = UseSD->FirstInBundle;
3157  if (!DestBundle->IsScheduled) {
3158  BundleMember->incrementUnscheduledDeps(1);
3159  }
3160  if (!DestBundle->hasValidDependencies()) {
3161  WorkList.push_back(DestBundle);
3162  }
3163  }
3164  } else {
3165  // I'm not sure if this can ever happen. But we need to be safe.
3166  // This lets the instruction/bundle never be scheduled and
3167  // eventually disable vectorization.
3168  BundleMember->Dependencies++;
3169  BundleMember->incrementUnscheduledDeps(1);
3170  }
3171  }
3172 
3173  // Handle the memory dependencies.
3174  ScheduleData *DepDest = BundleMember->NextLoadStore;
3175  if (DepDest) {
3176  Instruction *SrcInst = BundleMember->Inst;
3177  MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
3178  bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
3179  unsigned numAliased = 0;
3180  unsigned DistToSrc = 1;
3181 
3182  while (DepDest) {
3183  assert(isInSchedulingRegion(DepDest));
3184 
3185  // We have two limits to reduce the complexity:
3186  // 1) AliasedCheckLimit: It's a small limit to reduce calls to
3187  // SLP->isAliased (which is the expensive part in this loop).
3188  // 2) MaxMemDepDistance: It's for very large blocks and it aborts
3189  // the whole loop (even if the loop is fast, it's quadratic).
3190  // It's important for the loop break condition (see below) to
3191  // check this limit even between two read-only instructions.
3192  if (DistToSrc >= MaxMemDepDistance ||
3193  ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
3194  (numAliased >= AliasedCheckLimit ||
3195  SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
3196 
3197  // We increment the counter only if the locations are aliased
3198  // (instead of counting all alias checks). This gives a better
3199  // balance between reduced runtime and accurate dependencies.
3200  numAliased++;
3201 
3202  DepDest->MemoryDependencies.push_back(BundleMember);
3203  BundleMember->Dependencies++;
3204  ScheduleData *DestBundle = DepDest->FirstInBundle;
3205  if (!DestBundle->IsScheduled) {
3206  BundleMember->incrementUnscheduledDeps(1);
3207  }
3208  if (!DestBundle->hasValidDependencies()) {
3209  WorkList.push_back(DestBundle);
3210  }
3211  }
3212  DepDest = DepDest->NextLoadStore;
3213 
3214  // Example, explaining the loop break condition: Let's assume our
3215  // starting instruction is i0 and MaxMemDepDistance = 3.
3216  //
3217  // +--------v--v--v
3218  // i0,i1,i2,i3,i4,i5,i6,i7,i8
3219  // +--------^--^--^
3220  //
3221  // MaxMemDepDistance let us stop alias-checking at i3 and we add
3222  // dependencies from i0 to i3,i4,.. (even if they are not aliased).
3223  // Previously we already added dependencies from i3 to i6,i7,i8
3224  // (because of MaxMemDepDistance). As we added a dependency from
3225  // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
3226  // and we can abort this loop at i6.
3227  if (DistToSrc >= 2 * MaxMemDepDistance)
3228  break;
3229  DistToSrc++;
3230  }
3231  }
3232  }
3233  BundleMember = BundleMember->NextInBundle;
3234  }
3235  if (InsertInReadyList && SD->isReady()) {
3236  ReadyInsts.push_back(SD);
3237  DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
3238  }
3239  }
3240 }
3241 
3242 void BoUpSLP::BlockScheduling::resetSchedule() {
3243  assert(ScheduleStart &&
3244  "tried to reset schedule on block which has not been scheduled");
3245  for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3246  ScheduleData *SD = getScheduleData(I);
3247  assert(isInSchedulingRegion(SD));
3248  SD->IsScheduled = false;
3249  SD->resetUnscheduledDeps();
3250  }
3251  ReadyInsts.clear();
3252 }
3253 
3254 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
3255 
3256  if (!BS->ScheduleStart)
3257  return;
3258 
3259  DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
3260 
3261  BS->resetSchedule();
3262 
3263  // For the real scheduling we use a more sophisticated ready-list: it is
3264  // sorted by the original instruction location. This lets the final schedule
3265  // be as close as possible to the original instruction order.
3266  struct ScheduleDataCompare {
3267  bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
3268  return SD2->SchedulingPriority < SD1->SchedulingPriority;
3269  }
3270  };
3271  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
3272 
3273  // Ensure that all dependency data is updated and fill the ready-list with
3274  // initial instructions.
3275  int Idx = 0;
3276  int NumToSchedule = 0;
3277  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
3278  I = I->getNextNode()) {
3279  ScheduleData *SD = BS->getScheduleData(I);
3280  assert(
3281  SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
3282  "scheduler and vectorizer have different opinion on what is a bundle");
3283  SD->FirstInBundle->SchedulingPriority = Idx++;
3284  if (SD->isSchedulingEntity()) {
3285  BS->calculateDependencies(SD, false, this);
3286  NumToSchedule++;
3287  }
3288  }
3289  BS->initialFillReadyList(ReadyInsts);
3290 
3291  Instruction *LastScheduledInst = BS->ScheduleEnd;
3292 
3293  // Do the "real" scheduling.
3294  while (!ReadyInsts.empty()) {
3295  ScheduleData *picked = *ReadyInsts.begin();
3296  ReadyInsts.erase(ReadyInsts.begin());
3297 
3298  // Move the scheduled instruction(s) to their dedicated places, if not
3299  // there yet.
3300  ScheduleData *BundleMember = picked;
3301  while (BundleMember) {
3302  Instruction *pickedInst = BundleMember->Inst;
3303  if (LastScheduledInst->getNextNode() != pickedInst) {
3304  BS->BB->getInstList().remove(pickedInst);
3305  BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
3306  pickedInst);
3307  }
3308  LastScheduledInst = pickedInst;
3309  BundleMember = BundleMember->NextInBundle;
3310  }
3311 
3312  BS->schedule(picked, ReadyInsts);
3313  NumToSchedule--;
3314  }
3315  assert(NumToSchedule == 0 && "could not schedule all instructions");
3316 
3317  // Avoid duplicate scheduling of the block.
3318  BS->ScheduleStart = nullptr;
3319 }
3320 
3322  // If V is a store, just return the width of the stored value without
3323  // traversing the expression tree. This is the common case.
3324  if (auto *Store = dyn_cast<StoreInst>(V))
3325  return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
3326 
3327  // If V is not a store, we can traverse the expression tree to find loads
3328  // that feed it. The type of the loaded value may indicate a more suitable
3329  // width than V's type. We want to base the vector element size on the width
3330  // of memory operations where possible.
3333  if (auto *I = dyn_cast<Instruction>(V))
3334  Worklist.push_back(I);
3335 
3336  // Traverse the expression tree in bottom-up order looking for loads. If we
3337  // encounter an instruciton we don't yet handle, we give up.
3338  auto MaxWidth = 0u;
3339  auto FoundUnknownInst = false;
3340  while (!Worklist.empty() && !FoundUnknownInst) {
3341  auto *I = Worklist.pop_back_val();
3342  Visited.insert(I);
3343 
3344  // We should only be looking at scalar instructions here. If the current
3345  // instruction has a vector type, give up.
3346  auto *Ty = I->getType();
3347  if (isa<VectorType>(Ty))
3348  FoundUnknownInst = true;
3349 
3350  // If the current instruction is a load, update MaxWidth to reflect the
3351  // width of the loaded value.
3352  else if (isa<LoadInst>(I))
3353  MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
3354 
3355  // Otherwise, we need to visit the operands of the instruction. We only
3356  // handle the interesting cases from buildTree here. If an operand is an
3357  // instruction we haven't yet visited, we add it to the worklist.
3358  else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
3359  isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
3360  for (Use &U : I->operands())
3361  if (auto *J = dyn_cast<Instruction>(U.get()))
3362  if (!Visited.count(J))
3363  Worklist.push_back(J);
3364  }
3365 
3366  // If we don't yet handle the instruction, give up.
3367  else
3368  FoundUnknownInst = true;
3369  }
3370 
3371  // If we didn't encounter a memory access in the expression tree, or if we
3372  // gave up for some reason, just return the width of V.
3373  if (!MaxWidth || FoundUnknownInst)
3374  return DL->getTypeSizeInBits(V->getType());
3375 
3376  // Otherwise, return the maximum width we found.
3377  return MaxWidth;
3378 }
3379 
3380 // Determine if a value V in a vectorizable expression Expr can be demoted to a
3381 // smaller type with a truncation. We collect the values that will be demoted
3382 // in ToDemote and additional roots that require investigating in Roots.
3384  SmallVectorImpl<Value *> &ToDemote,
3385  SmallVectorImpl<Value *> &Roots) {
3386 
3387  // We can always demote constants.
3388  if (isa<Constant>(V)) {
3389  ToDemote.push_back(V);
3390  return true;
3391  }
3392 
3393  // If the value is not an instruction in the expression with only one use, it
3394  // cannot be demoted.
3395  auto *I = dyn_cast<Instruction>(V);
3396  if (!I || !I->hasOneUse() || !Expr.count(I))
3397  return false;
3398 
3399  switch (I->getOpcode()) {
3400 
3401  // We can always demote truncations and extensions. Since truncations can
3402  // seed additional demotion, we save the truncated value.
3403  case Instruction::Trunc:
3404  Roots.push_back(I->getOperand(0));
3405  case Instruction::ZExt:
3406  case Instruction::SExt:
3407  break;
3408 
3409  // We can demote certain binary operations if we can demote both of their
3410  // operands.
3411  case Instruction::Add:
3412  case Instruction::Sub:
3413  case Instruction::Mul:
3414  case Instruction::And:
3415  case Instruction::Or:
3416  case Instruction::Xor:
3417  if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
3418  !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
3419  return false;
3420  break;
3421 
3422  // We can demote selects if we can demote their true and false values.
3423  case Instruction::Select: {
3424  SelectInst *SI = cast<SelectInst>(I);
3425  if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
3426  !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
3427  return false;
3428  break;
3429  }
3430 
3431  // We can demote phis if we can demote all their incoming operands. Note that
3432  // we don't need to worry about cycles since we ensure single use above.
3433  case Instruction::PHI: {
3434  PHINode *PN = cast<PHINode>(I);
3435  for (Value *IncValue : PN->incoming_values())
3436  if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
3437  return false;
3438  break;
3439  }
3440 
3441  // Otherwise, conservatively give up.
3442  default:
3443  return false;
3444  }
3445 
3446  // Record the value that we can demote.
3447  ToDemote.push_back(V);
3448  return true;
3449 }
3450 
3452  // If there are no external uses, the expression tree must be rooted by a
3453  // store. We can't demote in-memory values, so there is nothing to do here.
3454  if (ExternalUses.empty())
3455  return;
3456 
3457  // We only attempt to truncate integer expressions.
3458  auto &TreeRoot = VectorizableTree[0].Scalars;
3459  auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
3460  if (!TreeRootIT)
3461  return;
3462 
3463  // If the expression is not rooted by a store, these roots should have
3464  // external uses. We will rely on InstCombine to rewrite the expression in
3465  // the narrower type. However, InstCombine only rewrites single-use values.
3466  // This means that if a tree entry other than a root is used externally, it
3467  // must have multiple uses and InstCombine will not rewrite it. The code
3468  // below ensures that only the roots are used externally.
3469  SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
3470  for (auto &EU : ExternalUses)
3471  if (!Expr.erase(EU.Scalar))
3472  return;
3473  if (!Expr.empty())
3474  return;
3475 
3476  // Collect the scalar values of the vectorizable expression. We will use this
3477  // context to determine which values can be demoted. If we see a truncation,
3478  // we mark it as seeding another demotion.
3479  for (auto &Entry : VectorizableTree)
3480  Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());
3481 
3482  // Ensure the roots of the vectorizable tree don't form a cycle. They must
3483  // have a single external user that is not in the vectorizable tree.
3484  for (auto *Root : TreeRoot)
3485  if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
3486  return;
3487 
3488  // Conservatively determine if we can actually truncate the roots of the
3489  // expression. Collect the values that can be demoted in ToDemote and
3490  // additional roots that require investigating in Roots.
3491  SmallVector<Value *, 32> ToDemote;
3493  for (auto *Root : TreeRoot)
3494  if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
3495  return;
3496 
3497  // The maximum bit width required to represent all the values that can be
3498  // demoted without loss of precision. It would be safe to truncate the roots
3499  // of the expression to this width.
3500  auto MaxBitWidth = 8u;
3501 
3502  // We first check if all the bits of the roots are demanded. If they're not,
3503  // we can truncate the roots to this narrower type.
3504  for (auto *Root : TreeRoot) {
3505  auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
3506  MaxBitWidth = std::max<unsigned>(
3507  Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
3508  }
3509 
3510  // True if the roots can be zero-extended back to their original type, rather
3511  // than sign-extended. We know that if the leading bits are not demanded, we
3512  // can safely zero-extend. So we initialize IsKnownPositive to True.
3513  bool IsKnownPositive = true;
3514 
3515  // If all the bits of the roots are demanded, we can try a little harder to
3516  // compute a narrower type. This can happen, for example, if the roots are
3517  // getelementptr indices. InstCombine promotes these indices to the pointer
3518  // width. Thus, all their bits are technically demanded even though the
3519  // address computation might be vectorized in a smaller type.
3520  //
3521  // We start by looking at each entry that can be demoted. We compute the
3522  // maximum bit width required to store the scalar by using ValueTracking to
3523  // compute the number of high-order bits we can truncate.
3524  if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
3525  MaxBitWidth = 8u;
3526 
3527  // Determine if the sign bit of all the roots is known to be zero. If not,
3528  // IsKnownPositive is set to False.
3529  IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
3530  bool KnownZero = false;
3531  bool KnownOne = false;
3532  ComputeSignBit(R, KnownZero, KnownOne, *DL);
3533  return KnownZero;
3534  });
3535 
3536  // Determine the maximum number of bits required to store the scalar
3537  // values.
3538  for (auto *Scalar : ToDemote) {
3539  auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT);
3540  auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
3541  MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
3542  }
3543 
3544  // If we can't prove that the sign bit is zero, we must add one to the
3545  // maximum bit width to account for the unknown sign bit. This preserves
3546  // the existing sign bit so we can safely sign-extend the root back to the
3547  // original type. Otherwise, if we know the sign bit is zero, we will
3548  // zero-extend the root instead.
3549  //
3550  // FIXME: This is somewhat suboptimal, as there will be cases where adding
3551  // one to the maximum bit width will yield a larger-than-necessary
3552  // type. In general, we need to add an extra bit only if we can't
3553  // prove that the upper bit of the original type is equal to the
3554  // upper bit of the proposed smaller type. If these two bits are the
3555  // same (either zero or one) we know that sign-extending from the
3556  // smaller type will result in the same value. Here, since we can't
3557  // yet prove this, we are just making the proposed smaller type
3558  // larger to ensure correctness.
3559  if (!IsKnownPositive)
3560  ++MaxBitWidth;
3561  }
3562 
3563  // Round MaxBitWidth up to the next power-of-two.
3564  if (!isPowerOf2_64(MaxBitWidth))
3565  MaxBitWidth = NextPowerOf2(MaxBitWidth);
3566 
3567  // If the maximum bit width we compute is less than the with of the roots'
3568  // type, we can proceed with the narrowing. Otherwise, do nothing.
3569  if (MaxBitWidth >= TreeRootIT->getBitWidth())
3570  return;
3571 
3572  // If we can truncate the root, we must collect additional values that might
3573  // be demoted as a result. That is, those seeded by truncations we will
3574  // modify.
3575  while (!Roots.empty())
3576  collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
3577 
3578  // Finally, map the values we can demote to the maximum bit with we computed.
3579  for (auto *Scalar : ToDemote)
3580  MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
3581 }
3582 
3583 namespace {
3584 /// The SLPVectorizer Pass.
3585 struct SLPVectorizer : public FunctionPass {
3586  SLPVectorizerPass Impl;
3587 
3588  /// Pass identification, replacement for typeid
3589  static char ID;
3590 
3591  explicit SLPVectorizer() : FunctionPass(ID) {
3593  }
3594 
3595 
3596  bool doInitialization(Module &M) override {
3597  return false;
3598  }
3599 
3600  bool runOnFunction(Function &F) override {
3601  if (skipFunction(F))
3602  return false;
3603 
3604  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
3605  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
3606  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
3607  auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
3608  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
3609  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
3610  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
3611  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
3612  auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
3613 
3614  return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
3615  }
3616 
3617  void getAnalysisUsage(AnalysisUsage &AU) const override {
3630  AU.setPreservesCFG();
3631  }
3632 };
3633 } // end anonymous namespace
3634 
3636  auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
3637  auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
3638  auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
3639  auto *AA = &AM.getResult<AAManager>(F);
3640  auto *LI = &AM.getResult<LoopAnalysis>(F);
3641  auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
3642  auto *AC = &AM.getResult<AssumptionAnalysis>(F);
3643  auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
3644 
3645  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
3646  if (!Changed)
3647  return PreservedAnalyses::all();
3648  PreservedAnalyses PA;
3649  PA.preserve<LoopAnalysis>();
3651  PA.preserve<AAManager>();
3652  PA.preserve<GlobalsAA>();
3653  return PA;
3654 }
3655 
3657  TargetTransformInfo *TTI_,
3658  TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
3659  LoopInfo *LI_, DominatorTree *DT_,
3660  AssumptionCache *AC_, DemandedBits *DB_) {
3661  SE = SE_;
3662  TTI = TTI_;
3663  TLI = TLI_;
3664  AA = AA_;
3665  LI = LI_;
3666  DT = DT_;
3667  AC = AC_;
3668  DB = DB_;
3669  DL = &F.getParent()->getDataLayout();
3670 
3671  Stores.clear();
3672  GEPs.clear();
3673  bool Changed = false;
3674 
3675  // If the target claims to have no vector registers don't attempt
3676  // vectorization.
3677  if (!TTI->getNumberOfRegisters(true))
3678  return false;
3679 
3680  // Don't vectorize when the attribute NoImplicitFloat is used.
3681  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
3682  return false;
3683 
3684  DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
3685 
3686  // Use the bottom up slp vectorizer to construct chains that start with
3687  // store instructions.
3688  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
3689 
3690  // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
3691  // delete instructions.
3692 
3693  // Scan the blocks in the function in post order.
3694  for (auto BB : post_order(&F.getEntryBlock())) {
3695  collectSeedInstructions(BB);
3696 
3697  // Vectorize trees that end at stores.
3698  if (!Stores.empty()) {
3699  DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
3700  << " underlying objects.\n");
3701  Changed |= vectorizeStoreChains(R);
3702  }
3703 
3704  // Vectorize trees that end at reductions.
3705  Changed |= vectorizeChainsInBlock(BB, R);
3706 
3707  // Vectorize the index computations of getelementptr instructions. This
3708  // is primarily intended to catch gather-like idioms ending at
3709  // non-consecutive loads.
3710  if (!GEPs.empty()) {
3711  DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
3712  << " underlying objects.\n");
3713  Changed |= vectorizeGEPIndices(BB, R);
3714  }
3715  }
3716 
3717  if (Changed) {
3718  R.optimizeGatherSequence();
3719  DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
3720  DEBUG(verifyFunction(F));
3721  }
3722  return Changed;
3723 }
3724 
3725 /// \brief Check that the Values in the slice in VL array are still existent in
3726 /// the WeakVH array.
3727 /// Vectorization of part of the VL array may cause later values in the VL array
3728 /// to become invalid. We track when this has happened in the WeakVH array.
3730  unsigned SliceBegin, unsigned SliceSize) {
3731  VL = VL.slice(SliceBegin, SliceSize);
3732  VH = VH.slice(SliceBegin, SliceSize);
3733  return !std::equal(VL.begin(), VL.end(), VH.begin());
3734 }
3735 
3736 bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
3737  unsigned VecRegSize) {
3738  unsigned ChainLen = Chain.size();
3739  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
3740  << "\n");
3741  unsigned Sz = R.getVectorElementSize(Chain[0]);
3742  unsigned VF = VecRegSize / Sz;
3743 
3744  if (!isPowerOf2_32(Sz) || VF < 2)
3745  return false;
3746 
3747  // Keep track of values that were deleted by vectorizing in the loop below.
3748  SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
3749 
3750  bool Changed = false;
3751  // Look for profitable vectorizable trees at all offsets, starting at zero.
3752  for (unsigned i = 0, e = ChainLen; i < e; ++i) {
3753  if (i + VF > e)
3754  break;
3755 
3756  // Check that a previous iteration of this loop did not delete the Value.
3757  if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
3758  continue;
3759 
3760  DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
3761  << "\n");
3762  ArrayRef<Value *> Operands = Chain.slice(i, VF);
3763 
3764  R.buildTree(Operands);
3766  continue;
3767 
3769 
3770  int Cost = R.getTreeCost();
3771 
3772  DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
3773  if (Cost < -SLPCostThreshold) {
3774  DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
3775  R.vectorizeTree();
3776 
3777  // Move to the next bundle.
3778  i += VF - 1;
3779  Changed = true;
3780  }
3781  }
3782 
3783  return Changed;
3784 }
3785 
3786 bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
3787  BoUpSLP &R) {
3788  SetVector<StoreInst *> Heads, Tails;
3789  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
3790 
3791  // We may run into multiple chains that merge into a single chain. We mark the
3792  // stores that we vectorized so that we don't visit the same store twice.
3793  BoUpSLP::ValueSet VectorizedStores;
3794  bool Changed = false;
3795 
3796  // Do a quadratic search on all of the given stores and find
3797  // all of the pairs of stores that follow each other.
3798  SmallVector<unsigned, 16> IndexQueue;
3799  for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
3800  IndexQueue.clear();
3801  // If a store has multiple consecutive store candidates, search Stores
3802  // array according to the sequence: from i+1 to e, then from i-1 to 0.
3803  // This is because usually pairing with immediate succeeding or preceding
3804  // candidate create the best chance to find slp vectorization opportunity.
3805  unsigned j = 0;
3806  for (j = i + 1; j < e; ++j)
3807  IndexQueue.push_back(j);
3808  for (j = i; j > 0; --j)
3809  IndexQueue.push_back(j - 1);
3810 
3811  for (auto &k : IndexQueue) {
3812  if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) {
3813  Tails.insert(Stores[k]);
3814  Heads.insert(Stores[i]);
3815  ConsecutiveChain[Stores[i]] = Stores[k];
3816  break;
3817  }
3818  }
3819  }
3820 
3821  // For stores that start but don't end a link in the chain:
3822  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
3823  it != e; ++it) {
3824  if (Tails.count(*it))
3825  continue;
3826 
3827  // We found a store instr that starts a chain. Now follow the chain and try
3828  // to vectorize it.
3829  BoUpSLP::ValueList Operands;
3830  StoreInst *I = *it;
3831  // Collect the chain into a list.
3832  while (Tails.count(I) || Heads.count(I)) {
3833  if (VectorizedStores.count(I))
3834  break;
3835  Operands.push_back(I);
3836  // Move to the next value in the chain.
3837  I = ConsecutiveChain[I];
3838  }
3839 
3840  // FIXME: Is division-by-2 the correct step? Should we assert that the
3841  // register size is a power-of-2?
3842  for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
3843  Size /= 2) {
3844  if (vectorizeStoreChain(Operands, R, Size)) {
3845  // Mark the vectorized stores so that we don't vectorize them again.
3846  VectorizedStores.insert(Operands.begin(), Operands.end());
3847  Changed = true;
3848  break;
3849  }
3850  }
3851  }
3852 
3853  return Changed;
3854 }
3855 
3856 void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
3857 
3858  // Initialize the collections. We will make a single pass over the block.
3859  Stores.clear();
3860  GEPs.clear();
3861 
3862  // Visit the store and getelementptr instructions in BB and organize them in
3863  // Stores and GEPs according to the underlying objects of their pointer
3864  // operands.
3865  for (Instruction &I : *BB) {
3866 
3867  // Ignore store instructions that are volatile or have a pointer operand
3868  // that doesn't point to a scalar type.
3869  if (auto *SI = dyn_cast<StoreInst>(&I)) {
3870  if (!SI->isSimple())
3871  continue;
3873  continue;
3874  Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
3875  }
3876 
3877  // Ignore getelementptr instructions that have more than one index, a
3878  // constant index, or a pointer operand that doesn't point to a scalar
3879  // type.
3880  else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
3881  auto Idx = GEP->idx_begin()->get();
3882  if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
3883  continue;
3884  if (!isValidElementType(Idx->getType()))
3885  continue;
3886  if (GEP->getType()->isVectorTy())
3887  continue;
3888  GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP);
3889  }
3890  }
3891 }
3892 
3893 bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
3894  if (!A || !B)
3895  return false;
3896  Value *VL[] = { A, B };
3897  return tryToVectorizeList(VL, R, None, true);
3898 }
3899 
3900 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
3901  ArrayRef<Value *> BuildVector,
3902  bool AllowReorder) {
3903  if (VL.size() < 2)
3904  return false;
3905 
3906  DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()
3907  << ".\n");
3908 
3909  // Check that all of the parts are scalar instructions of the same type.
3910  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
3911  if (!I0)
3912  return false;
3913 
3914  unsigned Opcode0 = I0->getOpcode();
3915 
3916  unsigned Sz = R.getVectorElementSize(I0);
3917  unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
3918  unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
3919  if (MaxVF < 2)
3920  return false;
3921 
3922  for (Value *V : VL) {
3923  Type *Ty = V->getType();
3924  if (!isValidElementType(Ty))
3925  return false;
3926  Instruction *Inst = dyn_cast<Instruction>(V);
3927  if (!Inst || Inst->getOpcode() != Opcode0)
3928  return false;
3929  }
3930 
3931  bool Changed = false;
3932 
3933  // Keep track of values that were deleted by vectorizing in the loop below.
3934  SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
3935 
3936  unsigned NextInst = 0, MaxInst = VL.size();
3937  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
3938  VF /= 2) {
3939  // No actual vectorization should happen, if number of parts is the same as
3940  // provided vectorization factor (i.e. the scalar type is used for vector
3941  // code during codegen).
3942  auto *VecTy = VectorType::get(VL[0]->getType(), VF);
3943  if (TTI->getNumberOfParts(VecTy) == VF)
3944  continue;
3945  for (unsigned I = NextInst; I < MaxInst; ++I) {
3946  unsigned OpsWidth = 0;
3947 
3948  if (I + VF > MaxInst)
3949  OpsWidth = MaxInst - I;
3950  else
3951  OpsWidth = VF;
3952 
3953  if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
3954  break;
3955 
3956  // Check that a previous iteration of this loop did not delete the Value.
3957  if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
3958  continue;
3959 
3960  DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
3961  << "\n");
3962  ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
3963 
3964  ArrayRef<Value *> BuildVectorSlice;
3965  if (!BuildVector.empty())
3966  BuildVectorSlice = BuildVector.slice(I, OpsWidth);
3967 
3968  R.buildTree(Ops, BuildVectorSlice);
3969  // TODO: check if we can allow reordering for more cases.
3970  if (AllowReorder && R.shouldReorder()) {
3971  // Conceptually, there is nothing actually preventing us from trying to
3972  // reorder a larger list. In fact, we do exactly this when vectorizing
3973  // reductions. However, at this point, we only expect to get here from
3974  // tryToVectorizePair().
3975  assert(Ops.size() == 2);
3976  assert(BuildVectorSlice.empty());
3977  Value *ReorderedOps[] = {Ops[1], Ops[0]};
3978  R.buildTree(ReorderedOps, None);
3979  }
3981  continue;
3982 
3984  int Cost = R.getTreeCost();
3985 
3986  if (Cost < -SLPCostThreshold) {
3987  DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
3988  Value *VectorizedRoot = R.vectorizeTree();
3989 
3990  // Reconstruct the build vector by extracting the vectorized root. This
3991  // way we handle the case where some elements of the vector are
3992  // undefined.
3993  // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
3994  if (!BuildVectorSlice.empty()) {
3995  // The insert point is the last build vector instruction. The
3996  // vectorized root will precede it. This guarantees that we get an
3997  // instruction. The vectorized tree could have been constant folded.
3998  Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
3999  unsigned VecIdx = 0;
4000  for (auto &V : BuildVectorSlice) {
4001  IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
4002  ++BasicBlock::iterator(InsertAfter));
4003  Instruction *I = cast<Instruction>(V);
4004  assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
4005  Instruction *Extract =
4006  cast<Instruction>(Builder.CreateExtractElement(
4007  VectorizedRoot, Builder.getInt32(VecIdx++)));
4008  I->setOperand(1, Extract);
4009  I->removeFromParent();
4010  I->insertAfter(Extract);
4011  InsertAfter = I;
4012  }
4013  }
4014  // Move to the next bundle.
4015  I += VF - 1;
4016  NextInst = I + 1;
4017  Changed = true;
4018  }
4019  }
4020  }
4021 
4022  return Changed;
4023 }
4024 
4025 bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
4026  if (!V)
4027  return false;
4028 
4029  // Try to vectorize V.
4030  if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
4031  return true;
4032 
4035  // Try to skip B.
4036  if (B && B->hasOneUse()) {
4037  BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
4038  BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
4039  if (tryToVectorizePair(A, B0, R)) {
4040  return true;
4041  }
4042  if (tryToVectorizePair(A, B1, R)) {
4043  return true;
4044  }
4045  }
4046 
4047  // Try to skip A.
4048  if (A && A->hasOneUse()) {
4051  if (tryToVectorizePair(A0, B, R)) {
4052  return true;
4053  }
4054  if (tryToVectorizePair(A1, B, R)) {
4055  return true;
4056  }
4057  }
4058  return 0;
4059 }
4060 
4061 /// \brief Generate a shuffle mask to be used in a reduction tree.
4062 ///
4063 /// \param VecLen The length of the vector to be reduced.
4064 /// \param NumEltsToRdx The number of elements that should be reduced in the
4065 /// vector.
4066 /// \param IsPairwise Whether the reduction is a pairwise or splitting
4067 /// reduction. A pairwise reduction will generate a mask of
4068 /// <0,2,...> or <1,3,..> while a splitting reduction will generate
4069 /// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
4070 /// \param IsLeft True will generate a mask of even elements, odd otherwise.
4071 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
4072  bool IsPairwise, bool IsLeft,
4073  IRBuilder<> &Builder) {
4074  assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
4075 
4076  SmallVector<Constant *, 32> ShuffleMask(
4077  VecLen, UndefValue::get(Builder.getInt32Ty()));
4078 
4079  if (IsPairwise)
4080  // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
4081  for (unsigned i = 0; i != NumEltsToRdx; ++i)
4082  ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
4083  else
4084  // Move the upper half of the vector to the lower half.
4085  for (unsigned i = 0; i != NumEltsToRdx; ++i)
4086  ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
4087 
4088  return ConstantVector::get(ShuffleMask);
4089 }
4090 
4091 namespace {
4092 /// Model horizontal reductions.
4093 ///
4094 /// A horizontal reduction is a tree of reduction operations (currently add and
4095 /// fadd) that has operations that can be put into a vector as its leaf.
4096 /// For example, this tree:
4097 ///
4098 /// mul mul mul mul
4099 /// \ / \ /
4100 /// + +
4101 /// \ /
4102 /// +
4103 /// This tree has "mul" as its reduced values and "+" as its reduction
4104 /// operations. A reduction might be feeding into a store or a binary operation
4105 /// feeding a phi.
4106 /// ...
4107 /// \ /
4108 /// +
4109 /// |
4110 /// phi +=
4111 ///
4112 /// Or:
4113 /// ...
4114 /// \ /
4115 /// +
4116 /// |
4117 /// *p =
4118 ///
4119 class HorizontalReduction {
4120  SmallVector<Value *, 16> ReductionOps;
4121  SmallVector<Value *, 32> ReducedVals;
4122 
4123  BinaryOperator *ReductionRoot;
4124  // After successfull horizontal reduction vectorization attempt for PHI node
4125  // vectorizer tries to update root binary op by combining vectorized tree and
4126  // the ReductionPHI node. But during vectorization this ReductionPHI can be
4127  // vectorized itself and replaced by the undef value, while the instruction
4128  // itself is marked for deletion. This 'marked for deletion' PHI node then can
4129  // be used in new binary operation, causing "Use still stuck around after Def
4130  // is destroyed" crash upon PHI node deletion.
4131  WeakVH ReductionPHI;
4132 
4133  /// The opcode of the reduction.
4134  unsigned ReductionOpcode;
4135  /// The opcode of the values we perform a reduction on.
4136  unsigned ReducedValueOpcode;
4137  /// Should we model this reduction as a pairwise reduction tree or a tree that
4138  /// splits the vector in halves and adds those halves.
4139  bool IsPairwiseReduction;
4140 
4141 public:
4142  /// The width of one full horizontal reduction operation.
4143  unsigned ReduxWidth;
4144 
4145  /// Minimal width of available vector registers. It's used to determine
4146  /// ReduxWidth.
4147  unsigned MinVecRegSize;
4148 
4149  HorizontalReduction(unsigned MinVecRegSize)
4150  : ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0),
4151  IsPairwiseReduction(false), ReduxWidth(0),
4152  MinVecRegSize(MinVecRegSize) {}
4153 
4154  /// \brief Try to find a reduction tree.
4155  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
4156  assert((!Phi || is_contained(Phi->operands(), B)) &&
4157  "Thi phi needs to use the binary operator");
4158 
4159  // We could have a initial reductions that is not an add.
4160  // r *= v1 + v2 + v3 + v4
4161  // In such a case start looking for a tree rooted in the first '+'.
4162  if (Phi) {
4163  if (B->getOperand(0) == Phi) {
4164  Phi = nullptr;
4165  B = dyn_cast<BinaryOperator>(B->getOperand(1));
4166  } else if (B->getOperand(1) == Phi) {
4167  Phi = nullptr;
4168  B = dyn_cast<BinaryOperator>(B->getOperand(0));
4169  }
4170  }
4171 
4172  if (!B)
4173  return false;
4174 
4175  Type *Ty = B->getType();
4176  if (!isValidElementType(Ty))
4177  return false;
4178 
4179  const DataLayout &DL = B->getModule()->getDataLayout();
4180  ReductionOpcode = B->getOpcode();
4181  ReducedValueOpcode = 0;
4182  // FIXME: Register size should be a parameter to this function, so we can
4183  // try different vectorization factors.
4184  ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
4185  ReductionRoot = B;
4186  ReductionPHI = Phi;
4187 
4188  if (ReduxWidth < 4)
4189  return false;
4190 
4191  // We currently only support adds.
4192  if (ReductionOpcode != Instruction::Add &&
4193  ReductionOpcode != Instruction::FAdd)
4194  return false;
4195 
4196  // Post order traverse the reduction tree starting at B. We only handle true
4197  // trees containing only binary operators or selects.
4199  Stack.push_back(std::make_pair(B, 0));
4200  while (!Stack.empty()) {
4201  Instruction *TreeN = Stack.back().first;
4202  unsigned EdgeToVist = Stack.back().second++;
4203  bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
4204 
4205  // Only handle trees in the current basic block.
4206  if (TreeN->getParent() != B->getParent())
4207  return false;
4208 
4209  // Each tree node needs to have one user except for the ultimate
4210  // reduction.
4211  if (!TreeN->hasOneUse() && TreeN != B)
4212  return false;
4213 
4214  // Postorder vist.
4215  if (EdgeToVist == 2 || IsReducedValue) {
4216  if (IsReducedValue) {
4217  // Make sure that the opcodes of the operations that we are going to
4218  // reduce match.
4219  if (!ReducedValueOpcode)
4220  ReducedValueOpcode = TreeN->getOpcode();
4221  else if (ReducedValueOpcode != TreeN->getOpcode())
4222  return false;
4223  ReducedVals.push_back(TreeN);
4224  } else {
4225  // We need to be able to reassociate the adds.
4226  if (!TreeN->isAssociative())
4227  return false;
4228  ReductionOps.push_back(TreeN);
4229  }
4230  // Retract.
4231  Stack.pop_back();
4232  continue;
4233  }
4234 
4235  // Visit left or right.
4236  Value *NextV = TreeN->getOperand(EdgeToVist);
4237  if (NextV != Phi) {
4238  auto *I = dyn_cast<Instruction>(NextV);
4239  // Continue analysis if the next operand is a reduction operation or
4240  // (possibly) a reduced value. If the reduced value opcode is not set,
4241  // the first met operation != reduction operation is considered as the
4242  // reduced value class.
4243  if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
4244  I->getOpcode() == ReductionOpcode)) {
4245  if (!ReducedValueOpcode && I->getOpcode() != ReductionOpcode)
4246  ReducedValueOpcode = I->getOpcode();
4247  Stack.push_back(std::make_pair(I, 0));
4248  continue;
4249  }
4250  return false;
4251  }
4252  }
4253  return true;
4254  }
4255 
4256  /// \brief Attempt to vectorize the tree found by
4257  /// matchAssociativeReduction.
4258  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
4259  if (ReducedVals.empty())
4260  return false;
4261 
4262  unsigned NumReducedVals = ReducedVals.size();
4263  if (NumReducedVals < ReduxWidth)
4264  return false;
4265 
4266  Value *VectorizedTree = nullptr;
4267  IRBuilder<> Builder(ReductionRoot);
4268  FastMathFlags Unsafe;
4269  Unsafe.setUnsafeAlgebra();
4270  Builder.setFastMathFlags(Unsafe);
4271  unsigned i = 0;
4272 
4273  for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
4274  auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
4275  V.buildTree(VL, ReductionOps);
4276  if (V.shouldReorder()) {
4277  SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
4278  V.buildTree(Reversed, ReductionOps);
4279  }
4281  continue;
4282 
4284 
4285  // Estimate cost.
4286  int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
4287  if (Cost >= -SLPCostThreshold)
4288  break;
4289 
4290  DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
4291  << ". (HorRdx)\n");
4292 
4293  // Vectorize a tree.
4294  DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
4295  Value *VectorizedRoot = V.vectorizeTree();
4296 
4297  // Emit a reduction.
4298  Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
4299  if (VectorizedTree) {
4300  Builder.SetCurrentDebugLocation(Loc);
4301  VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
4302  ReducedSubTree, "bin.rdx");
4303  } else
4304  VectorizedTree = ReducedSubTree;
4305  }
4306 
4307  if (VectorizedTree) {
4308  // Finish the reduction.
4309  for (; i < NumReducedVals; ++i) {
4310  Builder.SetCurrentDebugLocation(
4311  cast<Instruction>(ReducedVals[i])->getDebugLoc());
4312  VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
4313  ReducedVals[i]);
4314  }
4315  // Update users.
4316  if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
4317  assert(ReductionRoot && "Need a reduction operation");
4318  ReductionRoot->setOperand(0, VectorizedTree);
4319  ReductionRoot->setOperand(1, ReductionPHI);
4320  } else
4321  ReductionRoot->replaceAllUsesWith(VectorizedTree);
4322  }
4323  return VectorizedTree != nullptr;
4324  }
4325 
4326  unsigned numReductionValues() const {
4327  return ReducedVals.size();
4328  }
4329 
4330 private:
4331  /// \brief Calculate the cost of a reduction.
4332  int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
4333  Type *ScalarTy = FirstReducedVal->getType();
4334  Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
4335 
4336  int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
4337  int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
4338 
4339  IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
4340  int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
4341 
4342  int ScalarReduxCost =
4343  (ReduxWidth - 1) *
4344  TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);
4345 
4346  DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
4347  << " for reduction that starts with " << *FirstReducedVal
4348  << " (It is a "
4349  << (IsPairwiseReduction ? "pairwise" : "splitting")
4350  << " reduction)\n");
4351 
4352  return VecReduxCost - ScalarReduxCost;
4353  }
4354 
4355  static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
4356  Value *R, const Twine &Name = "") {
4357  if (Opcode == Instruction::FAdd)
4358  return Builder.CreateFAdd(L, R, Name);
4359  return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
4360  }
4361 
4362  /// \brief Emit a horizontal reduction of the vectorized value.
4363  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
4364  assert(VectorizedValue && "Need to have a vectorized tree node");
4365  assert(isPowerOf2_32(ReduxWidth) &&
4366  "We only handle power-of-two reductions for now");
4367 
4368  Value *TmpVec = VectorizedValue;
4369  for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
4370  if (IsPairwiseReduction) {
4371  Value *LeftMask =
4372  createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
4373  Value *RightMask =
4374  createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
4375 
4376  Value *LeftShuf = Builder.CreateShuffleVector(
4377  TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
4378  Value *RightShuf = Builder.CreateShuffleVector(
4379  TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
4380  "rdx.shuf.r");
4381  TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
4382  "bin.rdx");
4383  } else {
4384  Value *UpperHalf =
4385  createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
4386  Value *Shuf = Builder.CreateShuffleVector(
4387  TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
4388  TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
4389  }
4390  }
4391 
4392  // The result is in the first element of the vector.
4393  return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
4394  }
4395 };
4396 } // end anonymous namespace
4397 
4398 /// \brief Recognize construction of vectors like
4399 /// %ra = insertelement <4 x float> undef, float %s0, i32 0
4400 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1
4401 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2
4402 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3
4403 ///
4404 /// Returns true if it matches
4405 ///
4406 static bool findBuildVector(InsertElementInst *FirstInsertElem,
4407  SmallVectorImpl<Value *> &BuildVector,
4408  SmallVectorImpl<Value *> &BuildVectorOpds) {
4409  if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
4410  return false;
4411 
4412  InsertElementInst *IE = FirstInsertElem;
4413  while (true) {
4414  BuildVector.push_back(IE);
4415  BuildVectorOpds.push_back(IE->getOperand(1));
4416 
4417  if (IE->use_empty())
4418  return false;
4419 
4421  if (!NextUse)
4422  return true;
4423 
4424  // If this isn't the final use, make sure the next insertelement is the only
4425  // use. It's OK if the final constructed vector is used multiple times
4426  if (!IE->hasOneUse())
4427  return false;
4428 
4429  IE = NextUse;
4430  }
4431 
4432  return false;
4433 }
4434 
4435 /// \brief Like findBuildVector, but looks backwards for construction of aggregate.
4436 ///
4437 /// \return true if it matches.
4439  SmallVectorImpl<Value *> &BuildVector,
4440  SmallVectorImpl<Value *> &BuildVectorOpds) {
4441  if (!IV->hasOneUse())
4442  return false;
4443  Value *V = IV->getAggregateOperand();
4444  if (!isa<UndefValue>(V)) {
4446  if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds))
4447  return false;
4448  }
4449  BuildVector.push_back(IV);
4450  BuildVectorOpds.push_back(IV->getInsertedValueOperand());
4451  return true;
4452 }
4453 
4454 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
4455  return V->getType() < V2->getType();
4456 }
4457 
4458 /// \brief Try and get a reduction value from a phi node.
4459 ///
4460 /// Given a phi node \p P in a block \p ParentBB, consider possible reductions
4461 /// if they come from either \p ParentBB or a containing loop latch.
4462 ///
4463 /// \returns A candidate reduction value if possible, or \code nullptr \endcode
4464 /// if not possible.
4466  BasicBlock *ParentBB, LoopInfo *LI) {
4467  // There are situations where the reduction value is not dominated by the
4468  // reduction phi. Vectorizing such cases has been reported to cause
4469  // miscompiles. See PR25787.
4470  auto DominatedReduxValue = [&](Value *R) {
4471  return (
4472  dyn_cast<Instruction>(R) &&
4473  DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
4474  };
4475 
4476  Value *Rdx = nullptr;
4477 
4478  // Return the incoming value if it comes from the same BB as the phi node.
4479  if (P->getIncomingBlock(0) == ParentBB) {
4480  Rdx = P->getIncomingValue(0);
4481  } else if (P->getIncomingBlock(1) == ParentBB) {
4482  Rdx = P->getIncomingValue(1);
4483  }
4484 
4485  if (Rdx && DominatedReduxValue(Rdx))
4486  return Rdx;
4487 
4488  // Otherwise, check whether we have a loop latch to look at.
4489  Loop *BBL = LI->getLoopFor(ParentBB);
4490  if (!BBL)
4491  return nullptr;
4492  BasicBlock *BBLatch = BBL->getLoopLatch();
4493  if (!BBLatch)
4494  return nullptr;
4495 
4496  // There is a loop latch, return the incoming value if it comes from
4497  // that. This reduction pattern occasionally turns up.
4498  if (P->getIncomingBlock(0) == BBLatch) {
4499  Rdx = P->getIncomingValue(0);
4500  } else if (P->getIncomingBlock(1) == BBLatch) {
4501  Rdx = P->getIncomingValue(1);
4502  }
4503 
4504  if (Rdx && DominatedReduxValue(Rdx))
4505  return Rdx;
4506 
4507  return nullptr;
4508 }
4509 
4510 /// \brief Attempt to reduce a horizontal reduction.
4511 /// If it is legal to match a horizontal reduction feeding
4512 /// the phi node P with reduction operators BI, then check if it
4513 /// can be done.
4514 /// \returns true if a horizontal reduction was matched and reduced.
4515 /// \returns false if a horizontal reduction was not matched.
4517  BoUpSLP &R, TargetTransformInfo *TTI,
4518  unsigned MinRegSize) {
4519  if (!ShouldVectorizeHor)
4520  return false;
4521 
4522  HorizontalReduction HorRdx(MinRegSize);
4523  if (!HorRdx.matchAssociativeReduction(P, BI))
4524  return false;
4525 
4526  // If there is a sufficient number of reduction values, reduce
4527  // to a nearby power-of-2. Can safely generate oversized
4528  // vectors and rely on the backend to split them to legal sizes.
4529  HorRdx.ReduxWidth =
4530  std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
4531 
4532  return HorRdx.tryToReduce(R, TTI);
4533 }
4534 
4535 bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
4536  bool Changed = false;
4537  SmallVector<Value *, 4> Incoming;
4538  SmallSet<Value *, 16> VisitedInstrs;
4539 
4540  bool HaveVectorizedPhiNodes = true;
4541  while (HaveVectorizedPhiNodes) {
4542  HaveVectorizedPhiNodes = false;
4543 
4544  // Collect the incoming values from the PHIs.
4545  Incoming.clear();
4546  for (Instruction &I : *BB) {
4547  PHINode *P = dyn_cast<PHINode>(&I);
4548  if (!P)
4549  break;
4550 
4551  if (!VisitedInstrs.count(P))
4552  Incoming.push_back(P);
4553  }
4554 
4555  // Sort by type.
4556  std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
4557 
4558  // Try to vectorize elements base on their type.
4559  for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
4560  E = Incoming.end();
4561  IncIt != E;) {
4562 
4563  // Look for the next elements with the same type.
4564  SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
4565  while (SameTypeIt != E &&
4566  (*SameTypeIt)->getType() == (*IncIt)->getType()) {
4567  VisitedInstrs.insert(*SameTypeIt);
4568  ++SameTypeIt;
4569  }
4570 
4571  // Try to vectorize them.
4572  unsigned NumElts = (SameTypeIt - IncIt);
4573  DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
4574  if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
4575  // Success start over because instructions might have been changed.
4576  HaveVectorizedPhiNodes = true;
4577  Changed = true;
4578  break;
4579  }
4580 
4581  // Start over at the next instruction of a different type (or the end).
4582  IncIt = SameTypeIt;
4583  }
4584  }
4585 
4586  VisitedInstrs.clear();
4587 
4588  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
4589  // We may go through BB multiple times so skip the one we have checked.
4590  if (!VisitedInstrs.insert(&*it).second)
4591  continue;
4592 
4593  if (isa<DbgInfoIntrinsic>(it))
4594  continue;
4595 
4596  // Try to vectorize reductions that use PHINodes.
4597  if (PHINode *P = dyn_cast<PHINode>(it)) {
4598  // Check that the PHI is a reduction PHI.
4599  if (P->getNumIncomingValues() != 2)
4600  return Changed;
4601 
4602  Value *Rdx = getReductionValue(DT, P, BB, LI);
4603 
4604  // Check if this is a Binary Operator.
4605  BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
4606  if (!BI)
4607  continue;
4608 
4609  // Try to match and vectorize a horizontal reduction.
4610  if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
4611  Changed = true;
4612  it = BB->begin();
4613  e = BB->end();
4614  continue;
4615  }
4616 
4617  Value *Inst = BI->getOperand(0);
4618  if (Inst == P)
4619  Inst = BI->getOperand(1);
4620 
4621  if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
4622  // We would like to start over since some instructions are deleted
4623  // and the iterator may become invalid value.
4624  Changed = true;
4625  it = BB->begin();
4626  e = BB->end();
4627  continue;
4628  }
4629 
4630  continue;
4631  }
4632 
4634  if (StoreInst *SI = dyn_cast<StoreInst>(it))
4635  if (BinaryOperator *BinOp =
4636  dyn_cast<BinaryOperator>(SI->getValueOperand())) {
4637  if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
4638  R.getMinVecRegSize()) ||
4639  tryToVectorize(BinOp, R)) {
4640  Changed = true;
4641  it = BB->begin();
4642  e = BB->end();
4643  continue;
4644  }
4645  }
4646 
4647  // Try to vectorize horizontal reductions feeding into a return.
4648  if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
4649  if (RI->getNumOperands() != 0)
4650  if (BinaryOperator *BinOp =
4651  dyn_cast<BinaryOperator>(RI->getOperand(0))) {
4652  DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
4653  if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
4654  R.getMinVecRegSize()) ||
4655  tryToVectorizePair(BinOp->getOperand(0), BinOp->getOperand(1),
4656  R)) {
4657  Changed = true;
4658  it = BB->begin();
4659  e = BB->end();
4660  continue;
4661  }
4662  }
4663 
4664  // Try to vectorize trees that start at compare instructions.
4665  if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
4666  if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
4667  Changed = true;
4668  // We would like to start over since some instructions are deleted
4669  // and the iterator may become invalid value.
4670  it = BB->begin();
4671  e = BB->end();
4672  continue;
4673  }
4674 
4675  for (int i = 0; i < 2; ++i) {
4676  if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
4677  if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
4678  Changed = true;
4679  // We would like to start over since some instructions are deleted
4680  // and the iterator may become invalid value.
4681  it = BB->begin();
4682  e = BB->end();
4683  break;
4684  }
4685  }
4686  }
4687  continue;
4688  }
4689 
4690  // Try to vectorize trees that start at insertelement instructions.
4691  if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
4692  SmallVector<Value *, 16> BuildVector;
4693  SmallVector<Value *, 16> BuildVectorOpds;
4694  if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
4695  continue;
4696 
4697  // Vectorize starting with the build vector operands ignoring the
4698  // BuildVector instructions for the purpose of scheduling and user
4699  // extraction.
4700  if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
4701  Changed = true;
4702  it = BB->begin();
4703  e = BB->end();
4704  }
4705 
4706  continue;
4707  }
4708 
4709  // Try to vectorize trees that start at insertvalue instructions feeding into
4710  // a store.
4711  if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
4712  if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) {
4713  const DataLayout &DL = BB->getModule()->getDataLayout();
4714  if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) {
4715  SmallVector<Value *, 16> BuildVector;
4716  SmallVector<Value *, 16> BuildVectorOpds;
4717  if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds))
4718  continue;
4719 
4720  DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n");
4721  if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
4722  Changed = true;
4723  it = BB->begin();
4724  e = BB->end();
4725  }
4726  continue;
4727  }
4728  }
4729  }
4730  }
4731 
4732  return Changed;
4733 }
4734 
4735 bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
4736  auto Changed = false;
4737  for (auto &Entry : GEPs) {
4738 
4739  // If the getelementptr list has fewer than two elements, there's nothing
4740  // to do.
4741  if (Entry.second.size() < 2)
4742  continue;
4743 
4744  DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
4745  << Entry.second.size() << ".\n");
4746 
4747  // We process the getelementptr list in chunks of 16 (like we do for
4748  // stores) to minimize compile-time.
4749  for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
4750  auto Len = std::min<unsigned>(BE - BI, 16);
4751  auto GEPList = makeArrayRef(&Entry.second[BI], Len);
4752 
4753  // Initialize a set a candidate getelementptrs. Note that we use a
4754  // SetVector here to preserve program order. If the index computations
4755  // are vectorizable and begin with loads, we want to minimize the chance
4756  // of having to reorder them later.
4757  SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
4758 
4759  // Some of the candidates may have already been vectorized after we
4760  // initially collected them. If so, the WeakVHs will have nullified the
4761  // values, so remove them from the set of candidates.
4762  Candidates.remove(nullptr);
4763 
4764  // Remove from the set of candidates all pairs of getelementptrs with
4765  // constant differences. Such getelementptrs are likely not good
4766  // candidates for vectorization in a bottom-up phase since one can be
4767  // computed from the other. We also ensure all candidate getelementptr
4768  // indices are unique.
4769  for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
4770  auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
4771  if (!Candidates.count(GEPI))
4772  continue;
4773  auto *SCEVI = SE->getSCEV(GEPList[I]);
4774  for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
4775  auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
4776  auto *SCEVJ = SE->getSCEV(GEPList[J]);
4777  if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
4778  Candidates.remove(GEPList[I]);
4779  Candidates.remove(GEPList[J]);
4780  } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
4781  Candidates.remove(GEPList[J]);
4782  }
4783  }
4784  }
4785 
4786  // We break out of the above computation as soon as we know there are
4787  // fewer than two candidates remaining.
4788  if (Candidates.size() < 2)
4789  continue;
4790 
4791  // Add the single, non-constant index of each candidate to the bundle. We
4792  // ensured the indices met these constraints when we originally collected
4793  // the getelementptrs.
4794  SmallVector<Value *, 16> Bundle(Candidates.size());
4795  auto BundleIndex = 0u;
4796  for (auto *V : Candidates) {
4797  auto *GEP = cast<GetElementPtrInst>(V);
4798  auto *GEPIdx = GEP->idx_begin()->get();
4799  assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
4800  Bundle[BundleIndex++] = GEPIdx;
4801  }
4802 
4803  // Try and vectorize the indices. We are currently only interested in
4804  // gather-like cases of the form:
4805  //
4806  // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
4807  //
4808  // where the loads of "a", the loads of "b", and the subtractions can be
4809  // performed in parallel. It's likely that detecting this pattern in a
4810  // bottom-up phase will be simpler and less costly than building a
4811  // full-blown top-down phase beginning at the consecutive loads.
4812  Changed |= tryToVectorizeList(Bundle, R);
4813  }
4814  }
4815  return Changed;
4816 }
4817 
4818 bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
4819  bool Changed = false;
4820  // Attempt to sort and vectorize each of the store-groups.
4821  for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
4822  ++it) {
4823  if (it->second.size() < 2)
4824  continue;
4825 
4826  DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
4827  << it->second.size() << ".\n");
4828 
4829  // Process the stores in chunks of 16.
4830  // TODO: The limit of 16 inhibits greater vectorization factors.
4831  // For example, AVX2 supports v32i8. Increasing this limit, however,
4832  // may cause a significant compile-time increase.
4833  for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
4834  unsigned Len = std::min<unsigned>(CE - CI, 16);
4835  Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
4836  }
4837  }
4838  return Changed;
4839 }
4840 
4841 char SLPVectorizer::ID = 0;
4842 static const char lv_name[] = "SLP Vectorizer";
4843 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
4848 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
4850 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
4851 
4852 namespace llvm {
4853 Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
4854 }
Legacy wrapper pass to provide the GlobalsAAResult object.
MachineLoop * L
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:81
static unsigned getSameOpcode(ArrayRef< Value * > VL)
Return a value (possibly void), from a function.
Value * getValueOperand()
Definition: Instructions.h:391
void push_back(const T &Elt)
Definition: SmallVector.h:211
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:102
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
Definition: InstrTypes.h:840
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1554
static bool canCombineAsAltInst(unsigned Op)
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:870
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
raw_ostream & errs()
This returns a reference to a raw_ostream for standard error.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
This instruction extracts a struct member or array element value from an aggregate value...
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
SmallPtrSet< Value *, 16 > ValueSet
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function. ...
Definition: Function.cpp:226
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1032
Value * getAggregateOperand()
void dropAllReferences()
Drop all references to operands.
Definition: User.h:269
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:274
STATISTIC(NumFunctions,"Total number of functions")
bool hasValue() const
Definition: Optional.h:125
size_t i
This is the interface for a simple mod/ref and alias analysis over globals.
static bool allSameBlock(ArrayRef< Value * > VL)
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:52
bool hasNUses(unsigned N) const
Return true if this Value has exactly N users.
Definition: Value.cpp:99
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:144
static Value * getReductionValue(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction value from a phi node.
unsigned getNumOperands() const
Definition: User.h:167
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
The main scalar evolution driver.
iterator end() const
Definition: ArrayRef.h:130
bool isSimple() const
Definition: Instructions.h:384
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:84
ilist_iterator< OptionsT,!IsReverse, IsConst > getReverse() const
Get a reverse iterator to the same node.
This class represents a function call, abstracting a target machine's calling convention.
size_type count(PtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:380
An immutable pass that tracks lazily created AssumptionCache objects.
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space...
Definition: Type.cpp:655
static const unsigned MaxMemDepDistance
A cache of .assume calls within a function.
Analysis pass providing the TargetTransformInfo.
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this ""number "))
This class implements a map that also provides access to all stored values in a deterministic order...
Definition: MapVector.h:32
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:736
static bool findBuildVector(InsertElementInst *FirstInsertElem, SmallVectorImpl< Value * > &BuildVector, SmallVectorImpl< Value * > &BuildVectorOpds)
Recognize construction of vectors like ra = insertelement <4 x float> undef, float s0...
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1304
unsigned getNumberOfParts(Type *Tp) const
A debug info location.
Definition: DebugLoc.h:34
const Instruction & front() const
Definition: BasicBlock.h:240
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:189
reverse_iterator rend()
Definition: BasicBlock.h:235
An instruction for reading from memory.
Definition: Instructions.h:164
reverse_iterator rbegin()
Definition: BasicBlock.h:233
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:905
Hexagon Common GEP
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition: LoopInfo.h:575
#define op(i)
void reserve(size_type N)
Definition: SmallVector.h:377
bool isSimple() const
Definition: Instructions.h:263
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:93
op_iterator op_begin()
Definition: User.h:205
uint64_t getTypeAllocSizeInBits(Type *Ty) const
Returns the offset in bits between successive objects of the specified type, including alignment padd...
Definition: DataLayout.h:418
static bool hasValueBeenRAUWed(ArrayRef< Value * > VL, ArrayRef< WeakVH > VH, unsigned SliceBegin, unsigned SliceSize)
Check that the Values in the slice in VL array are still existent in the WeakVH array.
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:191
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
Definition: VectorUtils.cpp:86
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Definition: LoopInfoImpl.h:157
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:345
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:228
SmallVector< Instruction *, 16 > InstrList
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:347
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:670
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array...
Definition: ArrayRef.h:171
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:53
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
This class represents the LLVM 'select' instruction.
bool isIdenticalTo(const Instruction *I) const
Return true if the specified instruction is exactly identical to the current one. ...
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:578
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:143
ArrayRef< T > makeArrayRef(const T &OneElt)
Construct an ArrayRef from a single element.
Definition: ArrayRef.h:440
Class to represent struct types.
Definition: DerivedTypes.h:199
A Use represents the edge between a Value definition and its users.
Definition: Use.h:56
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
unsigned getNumArgOperands() const
Return the number of call arguments.
Instruction * getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:180
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:994
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:588
unsigned getNumIndices() const
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:90
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Windows NT (Windows on ARM)
static const unsigned AliasedCheckLimit
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:806
static bool allConstant(ArrayRef< Value * > VL)
bool remove(const value_type &X)
Remove an item from the set vector.
Definition: SetVector.h:152
static bool shouldReorderOperands(int i, Instruction &I, SmallVectorImpl< Value * > &Left, SmallVectorImpl< Value * > &Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, bool SplatLeft, bool SplatRight)
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:60
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:154
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:399
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1094
static bool isSimple(Instruction *I)
Choose alternate elements from vector.
static unsigned getAlignment(GlobalVariable *GV)
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1362
#define F(x, y, z)
Definition: MD5.cpp:51
static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI, BoUpSLP &R, TargetTransformInfo *TTI, unsigned MinRegSize)
Attempt to reduce a horizontal reduction.
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:136
void clear()
Definition: SmallSet.h:118
static unsigned isAltInst(ArrayRef< Value * > VL)
bool isAssociative() const
Return true if the instruction is associative:
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:134
#define T
const T & getValue() const LLVM_LVALUE_FUNCTION
Definition: Optional.h:121
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Definition: Type.cpp:646
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:269
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:116
Function Alias Analysis false
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:83
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const
Base class for the actual dominator tree node.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:33
Value * getInsertedValueOperand()
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:949
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_)
static Value * createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, bool IsPairwise, bool IsLeft, IRBuilder<> &Builder)
Generate a shuffle mask to be used in a reduction tree.
static GCRegistry::Add< OcamlGC > B("ocaml","ocaml 3.10-compatible GC")
An instruction for storing to memory.
Definition: Instructions.h:300
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:151
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:401
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1301
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:96
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:157
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:1341
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:141
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block...
Definition: IRBuilder.h:127
Class to represent pointers.
Definition: DerivedTypes.h:443
static bool collectValuesToDemote(Value *V, SmallPtrSetImpl< Value * > &Expr, SmallVectorImpl< Value * > &ToDemote, SmallVectorImpl< Value * > &Roots)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:416
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
static GCRegistry::Add< CoreCLRGC > E("coreclr","CoreCLR-compatible GC")
unsigned getNumIncomingValues() const
Return the number of incoming edges.
void replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:24
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
Definition: InstrTypes.h:1441
Value * CreateFCmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1561
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef< Type * > Tys, FastMathFlags FMF) const
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
LoadInst * CreateLoad(Value *Ptr, const char *Name)
Definition: IRBuilder.h:1082
#define P(N)
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:395
This instruction inserts a single (scalar) element into a VectorType value.
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled...
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:348
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Subclasses of this class are all able to terminate a basic block.
Definition: InstrTypes.h:52
Wrapper pass for TargetTransformInfo.
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:107
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:109
constexpr bool isPowerOf2_32(uint32_t Value)
isPowerOf2_32 - This function returns true if the argument is a power of two > 0. ...
Definition: MathExtras.h:399
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs...ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:653
std::enable_if<!std::is_array< T >::value, std::unique_ptr< T > >::type make_unique(Args &&...args)
Constructs a new T() with the given args and returns a unique_ptr<T> which owns the object...
Definition: STLExtras.h:845
LLVM Basic Block Representation.
Definition: BasicBlock.h:51
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:45
size_type size() const
Definition: SmallPtrSet.h:99
uint64_t getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:399
SmallVector< Value *, 8 > ValueList
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator begin()
Definition: SmallVector.h:115
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:36
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.h:1609
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
A manager for alias analyses.
APInt Or(const APInt &LHS, const APInt &RHS)
Bitwise OR function for APInt.
Definition: APInt.h:1947
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:368
APInt Xor(const APInt &LHS, const APInt &RHS)
Bitwise XOR function for APInt.
Definition: APInt.h:1952
unsigned getVectorElementSize(Value *V)
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:259
int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const
Calculate the cost of performing a vector reduction.
void setUnsafeAlgebra()
Definition: Operator.h:205
Represent the analysis usage information of a pass.
constexpr bool isPowerOf2_64(uint64_t Value)
isPowerOf2_64 - This function returns true if the argument is a power of two 0 (64 bit edition...
Definition: MathExtras.h:405
static bool findBuildAggregate(InsertValueInst *IV, SmallVectorImpl< Value * > &BuildVector, SmallVectorImpl< Value * > &BuildVectorOpds)
Like findBuildVector, but looks backwards for construction of aggregate.
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:109
void buildTree(ArrayRef< Value * > Roots, ArrayRef< Value * > UserIgnoreLst=None)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang","erlang-compatible garbage collector")
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return LHS-RHS. Minus is represented in SCEV as A+B*-1.
Analysis pass providing a never-invalidated alias analysis result.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:880
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE,"Assign register bank of generic virtual registers", false, false) RegBankSelect
iterator begin() const
Definition: ArrayRef.h:129
uint64_t getNumElements() const
Definition: DerivedTypes.h:335
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:298
Value * getOperand(unsigned i) const
Definition: User.h:145
op_range operands()
Definition: User.h:213
Value * getPointerOperand()
Definition: Instructions.h:270
bool isCommutative() const
Return true if the instruction is commutative:
Definition: Instruction.h:385
iterator_range< po_iterator< T > > post_order(const T &G)
self_iterator getIterator()
Definition: ilist_node.h:81
Class to represent integer types.
Definition: DerivedTypes.h:39
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:80
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:960
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:1629
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:136
void setAlignment(unsigned Align)
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
unsigned getMinVecRegSize() const
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1337
uint64_t NextPowerOf2(uint64_t A)
NextPowerOf2 - Returns the next power of two (in 64-bits) that is strictly greater than A...
Definition: MathExtras.h:619
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:113
int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index=0, Type *SubTp=nullptr) const
const Value * getTrueValue() const
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:391
Value * GetUnderlyingObject(Value *V, const DataLayout &DL, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value...
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1298
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:150
void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:3540
OperandValueProperties
Additional properties of an operand's values.
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Definition: Dominators.cpp:218
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:64
const Value * Ptr
The address of the start of the location.
#define SV_NAME
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:1574
Representation for a specific memory location.
A function analysis which provides an AssumptionCache.
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
BinaryOps getOpcode() const
Definition: InstrTypes.h:541
unsigned getABITypeAlignment(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:689
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:234
This is the common base class for memset/memcpy/memmove.
Iterator for intrusive lists based on ilist_node.
Value * CreateFAdd(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:827
This is the shared class of boolean and integer constants.
Definition: Constants.h:88
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr)
Return the number of times the sign bit of the register is replicated into the other bits...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false...
Definition: SmallPtrSet.h:375
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
iterator end()
Definition: BasicBlock.h:230
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:58
Module.h This file contains the declarations for the Module class.
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:230
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:59
Value * CreateInsertElement(Value *Vec, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:1642
Provides information about what library functions are available for the current target.
static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode)
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Definition: User.h:76
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:1344
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:50
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:382
SmallVector< StoreInst *, 8 > StoreList
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:307
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:1656
Function * getCalledFunction() const
Return the function called, or null if this is an indirect function invocation.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:276
const BasicBlock & getEntryBlock() const
Definition: Function.h:519
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:275
bool hasVectorInstrinsicScalarOpd(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the intrinsic has a scalar operand.
Definition: VectorUtils.cpp:71
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
Intrinsic::ID getIntrinsicID() const LLVM_READONLY
getIntrinsicID - This method returns the ID number of the specified function, or Intrinsic::not_intri...
Definition: Function.h:146
static GCRegistry::Add< ShadowStackGC > C("shadow-stack","Very portable GC for uncooperative code generators")
void setOperand(unsigned i, Value *Val)
Definition: User.h:150
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:586
Value * getArgOperand(unsigned i) const
getArgOperand/setArgOperand - Return/set the i-th call argument.
Value * CreateGEP(Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1141
Class to represent vector types.
Definition: DerivedTypes.h:369
bool hasIdenticalOperandBundleSchema(const OperandBundleUser< InstrTy, OpIteratorTy > &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:1490
Value * getIncomingValueForBlock(const BasicBlock *BB) const
iterator_range< user_iterator > users()
Definition: Value.h:370
static void clear(coro::Shape &Shape)
Definition: Coroutines.cpp:191
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
APInt And(const APInt &LHS, const APInt &RHS)
Bitwise AND function for APInt.
Definition: APInt.h:1942
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:72
Analysis pass that exposes the ScalarEvolution for a function.
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:332
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:384
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator end()
Definition: SmallVector.h:119
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:151
void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr)
Determine whether the sign bit is known to be zero or one.
void initializeSLPVectorizerPass(PassRegistry &)
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:205
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.h:226
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:368
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:227
static bool runImpl(CallGraphSCC &SCC, CallGraph &CG, function_ref< AAResults &(Function &F)> AARGetter, unsigned MaxElements)
#define I(x, y, z)
Definition: MD5.cpp:54
#define N
TerminatorInst * getTerminator()
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:124
LLVM_ATTRIBUTE_ALWAYS_INLINE size_type size() const
Definition: SmallVector.h:135
bool hasOneUse() const
Return true if there is exactly one user of this value.
Definition: Value.h:383
static void propagateIRFlags(Value *I, ArrayRef< Value * > VL)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
CallInst * CreateCall(Value *Callee, ArrayRef< Value * > Args=None, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1579
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:4430
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
unsigned getMaxVecRegSize() const
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:287
void preserve()
Mark an analysis as preserved.
Definition: PassManager.h:120
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:1350
static bool isSplat(ArrayRef< Value * > VL)
Pass * createSLPVectorizerPass()
unsigned canMapToVector(Type *T, const DataLayout &DL) const
Check if ArrayType or StructType is isomorphic to some VectorType.
Analysis pass providing the TargetLibraryInfo.
int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index=-1) const
static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA)
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1394
bool use_empty() const
Definition: Value.h:299
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static const char lv_name[]
static bool PhiTypeSorterFunc(Value *V, Value *V2)
LoadInst * CreateAlignedLoad(Value *Ptr, unsigned Align, const char *Name)
Definition: IRBuilder.h:1100
uint64_t PowerOf2Floor(uint64_t A)
Returns the power of two which is less than or equal to the given value.
Definition: MathExtras.h:631
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:537
LLVM Value Representation.
Definition: Value.h:71
void setAlignment(unsigned Align)
vector_type::const_iterator iterator
Definition: SetVector.h:49
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:111
A vector that has set insertion semantics.
Definition: SetVector.h:41
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:631
std::underlying_type< E >::type Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:81
static const Function * getParent(const Value *V)
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
Definition: Instruction.cpp:95
Broadcast element 0 to all other elements.
unsigned getNumberOfRegisters(bool Vector) const
uint64_t getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:533
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:44
#define DEBUG(X)
Definition: Debug.h:100
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:831
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:678
IRTranslator LLVM IR MI
const Value * getFalseValue() const
Convenience struct for specifying and reasoning about fast-math flags.
Definition: Operator.h:168
OperandValueKind
Additional information about an operand's possible values.
A container for analyses that lazily runs them and caches their results.
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index=-1) const
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:217
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
idx_iterator idx_begin() const
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree...
int * Ptr
DomTreeNodeBase< NodeT > * getNode(NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object...
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop)...
Definition: CodeMetrics.cpp:73
op_range incoming_values()
Bottom Up SLP Vectorizer.
static GCRegistry::Add< ErlangGC > A("erlang","erlang-compatible garbage collector")
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
unsigned getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
Value * getPointerOperand()
Definition: Instructions.h:394
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy=nullptr) const
const BasicBlock * getParent() const
Definition: Instruction.h:62
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:83
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL)
int getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info=OK_AnyValue, OperandValueKind Opd2Info=OK_AnyValue, OperandValueProperties Opd1PropInfo=OP_None, OperandValueProperties Opd2PropInfo=OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >()) const
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal].
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
This instruction inserts a struct field of array element value into an aggregate value.
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:34
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:783
static bool allSameType(ArrayRef< Value * > VL)
static unsigned getAltOpcode(unsigned Op)