1 //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass merges loads/stores to/from sequential memory addresses into vector
11 // loads/stores. Although there's nothing GPU-specific in here, this pass is
12 // motivated by the microarchitectural quirks of NVIDIA and AMD GPUs.
13 //
14 // (For simplicity below we talk about loads only, but everything also applies
15 // to stores.)
16 //
17 // This pass is intended to be run late in the pipeline, after other
18 // vectorization opportunities have been exploited. So the assumption here is
19 // that immediately following our new vector load we'll need to extract out the
20 // individual elements of the load, so we can operate on them individually.
21 //
22 // On CPUs this transformation is usually not beneficial, because extracting the
23 // elements of a vector register is expensive on most architectures. It's
24 // usually better just to load each element individually into its own scalar
25 // register.
26 //
27 // However, NVIDIA and AMD GPUs don't have proper vector registers. Instead, a
28 // "vector load" loads directly into a series of scalar registers. In effect,
29 // extracting the elements of the vector is free. It's therefore always
30 // beneficial to vectorize a sequence of loads on these architectures.
31 //
32 // Vectorizing (perhaps a better name might be "coalescing") loads can have
33 // large performance impacts on GPU kernels, and opportunities for vectorizing
34 // are common in GPU code. This pass tries very hard to find such
35 // opportunities; its runtime is quadratic in the number of loads in a BB.
36 //
37 // Some CPU architectures, such as ARM, have instructions that load into
38 // multiple scalar registers, similar to a GPU vectorized load. In theory ARM
39 // could use this pass (with some modifications), but currently it implements
40 // its own pass to do something similar to what we do here.
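//
// As an illustrative sketch of the transformation (the value names below are
// made up), four consecutive i32 loads such as
//
//   %l0 = load i32, i32* %p0, align 16
//   %l1 = load i32, i32* %p1, align 4   ; %p1 = %p0 + 4 bytes
//   %l2 = load i32, i32* %p2, align 8   ; %p2 = %p0 + 8 bytes
//   %l3 = load i32, i32* %p3, align 4   ; %p3 = %p0 + 12 bytes
//
// are replaced by a single wide load followed by per-element extracts:
//
//   %vp = bitcast i32* %p0 to <4 x i32>*
//   %v  = load <4 x i32>, <4 x i32>* %vp, align 16
//   %l0 = extractelement <4 x i32> %v, i32 0
//   %l1 = extractelement <4 x i32> %v, i32 1
//   %l2 = extractelement <4 x i32> %v, i32 2
//   %l3 = extractelement <4 x i32> %v, i32 3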
41 
42 #include "llvm/ADT/APInt.h"
43 #include "llvm/ADT/ArrayRef.h"
44 #include "llvm/ADT/MapVector.h"
46 #include "llvm/ADT/STLExtras.h"
47 #include "llvm/ADT/SmallPtrSet.h"
48 #include "llvm/ADT/SmallVector.h"
49 #include "llvm/ADT/Statistic.h"
59 #include "llvm/IR/Attributes.h"
60 #include "llvm/IR/BasicBlock.h"
61 #include "llvm/IR/Constants.h"
62 #include "llvm/IR/DataLayout.h"
63 #include "llvm/IR/DerivedTypes.h"
64 #include "llvm/IR/Dominators.h"
65 #include "llvm/IR/Function.h"
66 #include "llvm/IR/IRBuilder.h"
67 #include "llvm/IR/InstrTypes.h"
68 #include "llvm/IR/Instruction.h"
69 #include "llvm/IR/Instructions.h"
70 #include "llvm/IR/IntrinsicInst.h"
71 #include "llvm/IR/Module.h"
72 #include "llvm/IR/Type.h"
73 #include "llvm/IR/User.h"
74 #include "llvm/IR/Value.h"
75 #include "llvm/Pass.h"
76 #include "llvm/Support/Casting.h"
77 #include "llvm/Support/Debug.h"
78 #include "llvm/Support/KnownBits.h"
82 #include <algorithm>
83 #include <cassert>
84 #include <cstdlib>
85 #include <tuple>
86 #include <utility>
87 
88 using namespace llvm;
89 
90 #define DEBUG_TYPE "load-store-vectorizer"
91 
92 STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
93 STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
94 
95 // FIXME: Assuming stack alignment of 4 is always good enough
96 static const unsigned StackAdjustedAlignment = 4;
97 
98 namespace {
99 
100 /// ChainID is an arbitrary token that is allowed to be different only for the
101 /// accesses that are guaranteed to be considered non-consecutive by
102 /// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
103 /// together and reducing the number of instructions the main search operates on
104 /// at a time; this exists only to reduce compile time, as the main
105 /// search has O(n^2) time complexity. The underlying type of ChainID should not
106 /// be relied upon.
107 using ChainID = const Value *;
108 using InstrList = SmallVector<Instruction *, 8>;
109 using InstrListMap = MapVector<ChainID, InstrList>;
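// Illustrative example (hypothetical IR): accesses whose pointers share an
// underlying object get the same ChainID and therefore land in the same
// InstrList, while accesses to an unrelated object are kept apart:
//
//   %a0 = getelementptr inbounds i32, i32* %A, i64 0   ; ChainID == %A
//   %a1 = getelementptr inbounds i32, i32* %A, i64 1   ; ChainID == %A
//   %b0 = getelementptr inbounds i32, i32* %B, i64 0   ; ChainID == %B
//
// (assuming %A and %B are distinct underlying objects).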
110 
111 class Vectorizer {
112  Function &F;
113  AliasAnalysis &AA;
114  DominatorTree &DT;
115  ScalarEvolution &SE;
116  TargetTransformInfo &TTI;
117  const DataLayout &DL;
118  IRBuilder<> Builder;
119 
120 public:
121  Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
122  ScalarEvolution &SE, TargetTransformInfo &TTI)
123  : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
124  DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
125 
126  bool run();
127 
128 private:
129  unsigned getPointerAddressSpace(Value *I);
130 
131  unsigned getAlignment(LoadInst *LI) const {
132  unsigned Align = LI->getAlignment();
133  if (Align != 0)
134  return Align;
135 
136  return DL.getABITypeAlignment(LI->getType());
137  }
138 
139  unsigned getAlignment(StoreInst *SI) const {
140  unsigned Align = SI->getAlignment();
141  if (Align != 0)
142  return Align;
143 
144  return DL.getABITypeAlignment(SI->getValueOperand()->getType());
145  }
146 
147  static const unsigned MaxDepth = 3;
148 
149  bool isConsecutiveAccess(Value *A, Value *B);
150  bool areConsecutivePointers(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
151  unsigned Depth = 0) const;
152  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
153  unsigned Depth) const;
154  bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
155  unsigned Depth) const;
156 
157  /// After vectorization, reorder the instructions that I depends on
158  /// (the instructions defining its operands), to ensure they dominate I.
159  void reorder(Instruction *I);
160 
161  /// Returns the first and the last instructions in Chain.
162  std::pair<BasicBlock::iterator, BasicBlock::iterator>
163  getBoundaryInstrs(ArrayRef<Instruction *> Chain);
164 
165  /// Erases the original instructions after vectorizing.
166  void eraseInstructions(ArrayRef<Instruction *> Chain);
167 
168  /// "Legalize" the vector type that would be produced by combining \p
169  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
170  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
171  /// expected to have more than 4 elements.
172  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
173  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
174 
175  /// Finds the largest prefix of Chain that's vectorizable, checking for
176  /// intervening instructions which may affect the memory accessed by the
177  /// instructions within Chain.
178  ///
179  /// The elements of \p Chain must be all loads or all stores and must be in
180  /// address order.
181  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
182 
183  /// Collects load and store instructions to vectorize.
184  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
185 
186  /// Processes the collected instructions, the \p Map. The values of \p Map
187  /// should be all loads or all stores.
188  bool vectorizeChains(InstrListMap &Map);
189 
190  /// Finds loads/stores to consecutive memory addresses and vectorizes them.
191  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
192 
193  /// Vectorizes the load instructions in Chain.
194  bool
195  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
196  SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
197 
198  /// Vectorizes the store instructions in Chain.
199  bool
200  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
201  SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
202 
203  /// Checks whether this load/store access is misaligned.
204  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
205  unsigned Alignment);
206 };
207 
208 class LoadStoreVectorizer : public FunctionPass {
209 public:
210  static char ID;
211 
212  LoadStoreVectorizer() : FunctionPass(ID) {
213  initializeLoadStoreVectorizerPass(*PassRegistry::getPassRegistry());
214  }
215 
216  bool runOnFunction(Function &F) override;
217 
218  StringRef getPassName() const override {
219  return "GPU Load and Store Vectorizer";
220  }
221 
222  void getAnalysisUsage(AnalysisUsage &AU) const override {
223  AU.addRequired<AAResultsWrapperPass>();
224  AU.addRequired<ScalarEvolutionWrapperPass>();
225  AU.addRequired<DominatorTreeWrapperPass>();
226  AU.addRequired<TargetTransformInfoWrapperPass>();
227  AU.setPreservesCFG();
228  }
229 };
230 
231 } // end anonymous namespace
232 
233 char LoadStoreVectorizer::ID = 0;
234 
235 INITIALIZE_PASS_BEGIN(LoadStoreVectorizer, DEBUG_TYPE,
236  "Vectorize load and Store instructions", false, false)
242 INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE,
243  "Vectorize load and store instructions", false, false)
244 
245 Pass *llvm::createLoadStoreVectorizerPass() {
246  return new LoadStoreVectorizer();
247 }
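// Illustrative usage sketch (not part of this file): a target that benefits
// from this pass typically schedules it late in its IR pipeline, e.g. from a
// TargetPassConfig::addIRPasses() override:
//
//   if (getOptLevel() != CodeGenOpt::None)
//     addPass(createLoadStoreVectorizerPass());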
248 
249 // The real propagateMetadata expects a SmallVector<Value*>, but we deal in
250 // vectors of Instructions.
251 static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
252  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
253  propagateMetadata(I, VL);
254 }
255 
256 bool LoadStoreVectorizer::runOnFunction(Function &F) {
257  // Don't vectorize when the attribute NoImplicitFloat is used.
258  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
259  return false;
260 
261  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
262  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
263  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
264  TargetTransformInfo &TTI =
265  getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
266 
267  Vectorizer V(F, AA, DT, SE, TTI);
268  return V.run();
269 }
270 
271 // Vectorizer Implementation
272 bool Vectorizer::run() {
273  bool Changed = false;
274 
275  // Scan the blocks in the function in post order.
276  for (BasicBlock *BB : post_order(&F)) {
277  InstrListMap LoadRefs, StoreRefs;
278  std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
279  Changed |= vectorizeChains(LoadRefs);
280  Changed |= vectorizeChains(StoreRefs);
281  }
282 
283  return Changed;
284 }
285 
286 unsigned Vectorizer::getPointerAddressSpace(Value *I) {
287  if (LoadInst *L = dyn_cast<LoadInst>(I))
288  return L->getPointerAddressSpace();
289  if (StoreInst *S = dyn_cast<StoreInst>(I))
290  return S->getPointerAddressSpace();
291  return -1;
292 }
293 
294 // FIXME: Merge with llvm::isConsecutiveAccess
295 bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
296  Value *PtrA = getLoadStorePointerOperand(A);
297  Value *PtrB = getLoadStorePointerOperand(B);
298  unsigned ASA = getPointerAddressSpace(A);
299  unsigned ASB = getPointerAddressSpace(B);
300 
301  // Check that the address spaces match and that the pointers are valid.
302  if (!PtrA || !PtrB || (ASA != ASB))
303  return false;
304 
305  // Make sure that A and B are different pointers whose pointee types have the same size.
306  Type *PtrATy = PtrA->getType()->getPointerElementType();
307  Type *PtrBTy = PtrB->getType()->getPointerElementType();
308  if (PtrA == PtrB ||
309  PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
310  DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
311  DL.getTypeStoreSize(PtrATy->getScalarType()) !=
312  DL.getTypeStoreSize(PtrBTy->getScalarType()))
313  return false;
314 
315  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
316  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
317 
318  return areConsecutivePointers(PtrA, PtrB, Size);
319 }
320 
321 bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
322  const APInt &PtrDelta,
323  unsigned Depth) const {
324  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
325  APInt OffsetA(PtrBitWidth, 0);
326  APInt OffsetB(PtrBitWidth, 0);
327  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
328  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
329 
330  APInt OffsetDelta = OffsetB - OffsetA;
331 
332  // Check if they are based on the same pointer. That makes the offsets
333  // sufficient.
334  if (PtrA == PtrB)
335  return OffsetDelta == PtrDelta;
336 
337  // Compute the necessary base pointer delta to have the necessary final delta
338  // equal to the pointer delta requested.
339  APInt BaseDelta = PtrDelta - OffsetDelta;
340 
341  // Compute the distance with SCEV between the base pointers.
342  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
343  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
344  const SCEV *C = SE.getConstant(BaseDelta);
345  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
346  if (X == PtrSCEVB)
347  return true;
348 
349  // The above check will not catch the cases where one of the pointers is
350  // factorized but the other one is not, such as (C + (S * (A + B))) vs
351  // (AS + BS). Get the minus SCEV. That will allow re-combining the expressions
352  // and getting the simplified difference.
353  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
354  if (C == Dist)
355  return true;
356 
357  // Sometimes even this doesn't work, because SCEV can't always see through
358  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
359  // things the hard way.
360  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
361 }
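// Illustrative example of the SCEV reasoning above (hypothetical IR): with
//
//   %off4 = add i64 %off, 4
//   %pA   = getelementptr i8, i8* %base, i64 %off
//   %pB   = getelementptr i8, i8* %base, i64 %off4
//
// and PtrDelta == 4, stripping constant offsets leaves two different base
// pointers, but SCEV folds (%base + %off) + 4 and %base + (%off + 4) to the
// same expression, so the getAddExpr/getMinusSCEV comparisons above succeed
// and the pointers are proven consecutive.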
362 
363 bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
364  APInt PtrDelta,
365  unsigned Depth) const {
366  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
367  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
368  if (!GEPA || !GEPB)
369  return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
370 
371  // Look through GEPs after checking they're the same except for the last
372  // index.
373  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
374  GEPA->getPointerOperand() != GEPB->getPointerOperand())
375  return false;
376  gep_type_iterator GTIA = gep_type_begin(GEPA);
377  gep_type_iterator GTIB = gep_type_begin(GEPB);
378  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
379  if (GTIA.getOperand() != GTIB.getOperand())
380  return false;
381  ++GTIA;
382  ++GTIB;
383  }
384 
385  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
386  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
387  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
388  OpA->getType() != OpB->getType())
389  return false;
390 
391  if (PtrDelta.isNegative()) {
392  if (PtrDelta.isMinSignedValue())
393  return false;
394  PtrDelta.negate();
395  std::swap(OpA, OpB);
396  }
397  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
398  if (PtrDelta.urem(Stride) != 0)
399  return false;
400  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
401  APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
402 
403  // Only look through a ZExt/SExt.
404  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
405  return false;
406 
407  bool Signed = isa<SExtInst>(OpA);
408 
409  // At this point A could be a function parameter, i.e. not an instruction
410  Value *ValA = OpA->getOperand(0);
411  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
412  if (!OpB || ValA->getType() != OpB->getType())
413  return false;
414 
415  // Now we need to prove that adding IdxDiff to ValA won't overflow.
416  bool Safe = false;
417  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
418  // ValA, we're okay.
419  if (OpB->getOpcode() == Instruction::Add &&
420  isa<ConstantInt>(OpB->getOperand(1)) &&
421  IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
422  if (Signed)
423  Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
424  else
425  Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
426  }
427 
428  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
429 
430  // Second attempt:
431  // If all set bits of IdxDiff or any higher order bit other than the sign bit
432  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
433  // overflow of any sort.
434  if (!Safe) {
435  OpA = dyn_cast<Instruction>(ValA);
436  if (!OpA)
437  return false;
438  KnownBits Known(BitWidth);
439  computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
440  APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
441  if (Signed)
442  BitsAllowedToBeSet.clearBit(BitWidth - 1);
443  if (BitsAllowedToBeSet.ult(IdxDiff))
444  return false;
445  }
446 
447  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
448  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
449  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
450  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
451  return X == OffsetSCEVB;
452 }
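// Illustrative case (hypothetical IR) that the plain SCEV comparisons miss
// but the logic above handles:
//
//   %i1 = add nuw i32 %i, 1
//   %e0 = zext i32 %i  to i64
//   %e1 = zext i32 %i1 to i64
//   %p0 = getelementptr inbounds float, float* %base, i64 %e0
//   %p1 = getelementptr inbounds float, float* %base, i64 %e1
//
// Here Stride is 4, IdxDiff is 1, and the 'nuw' on the add proves %i + 1
// cannot wrap, so the final SCEV comparison of %i and %i1 succeeds.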
453 
454 bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
455  const APInt &PtrDelta,
456  unsigned Depth) const {
457  if (Depth++ == MaxDepth)
458  return false;
459 
460  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
461  if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
462  return SelectA->getCondition() == SelectB->getCondition() &&
463  areConsecutivePointers(SelectA->getTrueValue(),
464  SelectB->getTrueValue(), PtrDelta, Depth) &&
465  areConsecutivePointers(SelectA->getFalseValue(),
466  SelectB->getFalseValue(), PtrDelta, Depth);
467  }
468  }
469  return false;
470 }
471 
472 void Vectorizer::reorder(Instruction *I) {
473  OrderedBasicBlock OBB(I->getParent());
474  SmallPtrSet<Instruction *, 16> InstructionsToMove;
475  SmallVector<Instruction *, 16> Worklist;
476 
477  Worklist.push_back(I);
478  while (!Worklist.empty()) {
479  Instruction *IW = Worklist.pop_back_val();
480  int NumOperands = IW->getNumOperands();
481  for (int i = 0; i < NumOperands; i++) {
482  Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
483  if (!IM || IM->getOpcode() == Instruction::PHI)
484  continue;
485 
486  // If IM is in another BB, no need to move it, because this pass only
487  // vectorizes instructions within one BB.
488  if (IM->getParent() != I->getParent())
489  continue;
490 
491  if (!OBB.dominates(IM, I)) {
492  InstructionsToMove.insert(IM);
493  Worklist.push_back(IM);
494  }
495  }
496  }
497 
498  // All instructions to move should follow I. Start from I, not from begin().
499  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
500  ++BBI) {
501  if (!InstructionsToMove.count(&*BBI))
502  continue;
503  Instruction *IM = &*BBI;
504  --BBI;
505  IM->removeFromParent();
506  IM->insertBefore(I);
507  }
508 }
509 
510 std::pair<BasicBlock::iterator, BasicBlock::iterator>
511 Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
512  Instruction *C0 = Chain[0];
513  BasicBlock::iterator FirstInstr = C0->getIterator();
514  BasicBlock::iterator LastInstr = C0->getIterator();
515 
516  BasicBlock *BB = C0->getParent();
517  unsigned NumFound = 0;
518  for (Instruction &I : *BB) {
519  if (!is_contained(Chain, &I))
520  continue;
521 
522  ++NumFound;
523  if (NumFound == 1) {
524  FirstInstr = I.getIterator();
525  }
526  if (NumFound == Chain.size()) {
527  LastInstr = I.getIterator();
528  break;
529  }
530  }
531 
532  // Range is [first, last).
533  return std::make_pair(FirstInstr, ++LastInstr);
534 }
535 
536 void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
537  SmallVector<Instruction *, 16> Instrs;
538  for (Instruction *I : Chain) {
539  Value *PtrOperand = getLoadStorePointerOperand(I);
540  assert(PtrOperand && "Instruction must have a pointer operand.");
541  Instrs.push_back(I);
542  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
543  Instrs.push_back(GEP);
544  }
545 
546  // Erase instructions.
547  for (Instruction *I : Instrs)
548  if (I->use_empty())
549  I->eraseFromParent();
550 }
551 
552 std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
553 Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
554  unsigned ElementSizeBits) {
555  unsigned ElementSizeBytes = ElementSizeBits / 8;
556  unsigned SizeBytes = ElementSizeBytes * Chain.size();
557  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
558  if (NumLeft == Chain.size()) {
559  if ((NumLeft & 1) == 0)
560  NumLeft /= 2; // Split even in half
561  else
562  --NumLeft; // Split off last element
563  } else if (NumLeft == 0)
564  NumLeft = 1;
565  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
566 }
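// Worked example (illustrative): for a chain of nine i16 stores,
// ElementSizeBytes = 2 and SizeBytes = 18, so NumLeft = 16 / 2 = 8 and the
// chain is split into its first eight elements plus the trailing one. For
// eight i16 stores, SizeBytes = 16 is already a multiple of 4, so NumLeft
// starts equal to the chain size and the chain is split evenly in half.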
567 
568 ArrayRef<Instruction *>
569 Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
570  // These are in BB order, unlike Chain, which is in address order.
571  SmallVector<Instruction *, 16> MemoryInstrs;
572  SmallVector<Instruction *, 16> ChainInstrs;
573 
574  bool IsLoadChain = isa<LoadInst>(Chain[0]);
575  LLVM_DEBUG({
576  for (Instruction *I : Chain) {
577  if (IsLoadChain)
578  assert(isa<LoadInst>(I) &&
579  "All elements of Chain must be loads, or all must be stores.");
580  else
581  assert(isa<StoreInst>(I) &&
582  "All elements of Chain must be loads, or all must be stores.");
583  }
584  });
585 
586  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
587  if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
588  if (!is_contained(Chain, &I))
589  MemoryInstrs.push_back(&I);
590  else
591  ChainInstrs.push_back(&I);
592  } else if (isa<IntrinsicInst>(&I) &&
593  cast<IntrinsicInst>(&I)->getIntrinsicID() ==
594  Intrinsic::sideeffect) {
595  // Ignore llvm.sideeffect calls.
596  } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
597  LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
598  << '\n');
599  break;
600  } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
601  LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
602  << '\n');
603  break;
604  }
605  }
606 
607  OrderedBasicBlock OBB(Chain[0]->getParent());
608 
609  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
610  unsigned ChainInstrIdx = 0;
611  Instruction *BarrierMemoryInstr = nullptr;
612 
613  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
614  Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
615 
616  // If a barrier memory instruction was found, chain instructions that follow
617  // will not be added to the valid prefix.
618  if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
619  break;
620 
621  // Check (in BB order) if any instruction prevents ChainInstr from being
622  // vectorized. Find and store the first such "conflicting" instruction.
623  for (Instruction *MemInstr : MemoryInstrs) {
624  // If a barrier memory instruction was found, do not check past it.
625  if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
626  break;
627 
628  auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
629  auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
630  if (MemLoad && ChainLoad)
631  continue;
632 
633  // We can ignore the alias if we have a load/store pair and the load
634  // is known to be invariant. The load cannot be clobbered by the store.
635  auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
636  return LI->getMetadata(LLVMContext::MD_invariant_load);
637  };
638 
639  // We can ignore the alias as long as the load comes before the store,
640  // because that means we won't be moving the load past the store to
641  // vectorize it (the vectorized load is inserted at the location of the
642  // first load in the chain).
643  if (isa<StoreInst>(MemInstr) && ChainLoad &&
644  (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))
645  continue;
646 
647  // Same case, but in reverse.
648  if (MemLoad && isa<StoreInst>(ChainInstr) &&
649  (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))
650  continue;
651 
652  if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
653  MemoryLocation::get(ChainInstr))) {
654  LLVM_DEBUG({
655  dbgs() << "LSV: Found alias:\n"
656  " Aliasing instruction and pointer:\n"
657  << " " << *MemInstr << '\n'
658  << " " << *getLoadStorePointerOperand(MemInstr) << '\n'
659  << " Aliased instruction and pointer:\n"
660  << " " << *ChainInstr << '\n'
661  << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
662  });
663  // Save this aliasing memory instruction as a barrier, but allow other
664  // instructions that precede the barrier to be vectorized with this one.
665  BarrierMemoryInstr = MemInstr;
666  break;
667  }
668  }
669  // Continue the search only for store chains, since vectorizing stores that
670  // precede an aliasing load is valid. Conversely, vectorizing loads is valid
671  // up to an aliasing store, but should not pull loads from further down in
672  // the basic block.
673  if (IsLoadChain && BarrierMemoryInstr) {
674  // The BarrierMemoryInstr is a store that precedes ChainInstr.
675  assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
676  break;
677  }
678  }
679 
680  // Find the largest prefix of Chain whose elements are all in
681  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
682  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
683  // order.)
684  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
685  ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
686  unsigned ChainIdx = 0;
687  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
688  if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
689  break;
690  }
691  return Chain.slice(0, ChainIdx);
692 }
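// Illustrative example: given the address-ordered load chain {L0, L1, L2} and
// a store S that sits between L1 and L2 in the basic block and may alias L2,
// S becomes the barrier. L2 is cut off and only the prefix {L0, L1} is
// returned for vectorization; the wide load is inserted at L0's position, so
// no load is ever moved past S.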
693 
694 static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {
695  const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
696  if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
697  // The selects themselves are distinct instructions even if they share the
698  // same condition and evaluate to consecutive pointers for the true and false
699  // values of the condition. Therefore, using the selects themselves for
700  // grouping instructions would put consecutive accesses into different lists;
701  // they would then never be checked for being consecutive, and would not be
702  // vectorized.
703  return Sel->getCondition();
704  }
705  return ObjPtr;
706 }
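// Illustrative example (hypothetical IR): the two selects below share one
// condition and, for either value of %c, produce consecutive pointers, so
// both accesses are keyed by %c rather than by the two distinct select
// instructions:
//
//   %p0 = select i1 %c, i32* %a0, i32* %b0
//   %p1 = select i1 %c, i32* %a1, i32* %b1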
707 
708 std::pair<InstrListMap, InstrListMap>
709 Vectorizer::collectInstructions(BasicBlock *BB) {
710  InstrListMap LoadRefs;
711  InstrListMap StoreRefs;
712 
713  for (Instruction &I : *BB) {
714  if (!I.mayReadOrWriteMemory())
715  continue;
716 
717  if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
718  if (!LI->isSimple())
719  continue;
720 
721  // Skip if it's not legal.
722  if (!TTI.isLegalToVectorizeLoad(LI))
723  continue;
724 
725  Type *Ty = LI->getType();
726  if (!VectorType::isValidElementType(Ty->getScalarType()))
727  continue;
728 
729  // Skip weird non-byte sizes. They probably aren't worth the effort of
730  // handling correctly.
731  unsigned TySize = DL.getTypeSizeInBits(Ty);
732  if ((TySize % 8) != 0)
733  continue;
734 
735  // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
736  // functions currently use an integer type for the vectorized
737  // load/store, and do not support casting between the integer type and a
738  // vector of pointers (e.g. i64 to <2 x i16*>).
739  if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
740  continue;
741 
742  Value *Ptr = LI->getPointerOperand();
743  unsigned AS = Ptr->getType()->getPointerAddressSpace();
744  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
745 
746  unsigned VF = VecRegSize / TySize;
747  VectorType *VecTy = dyn_cast<VectorType>(Ty);
748 
749  // No point in looking at these if they're too big to vectorize.
750  if (TySize > VecRegSize / 2 ||
751  (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
752  continue;
753 
754  // Make sure all the users of a vector are constant-index extracts.
755  if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
756  const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
757  return EEI && isa<ConstantInt>(EEI->getOperand(1));
758  }))
759  continue;
760 
761  // Save the load locations.
762  const ChainID ID = getChainID(Ptr, DL);
763  LoadRefs[ID].push_back(LI);
764  } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
765  if (!SI->isSimple())
766  continue;
767 
768  // Skip if it's not legal.
769  if (!TTI.isLegalToVectorizeStore(SI))
770  continue;
771 
772  Type *Ty = SI->getValueOperand()->getType();
773  if (!VectorType::isValidElementType(Ty->getScalarType()))
774  continue;
775 
776  // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
777  // functions currently use an integer type for the vectorized
778  // load/store, and do not support casting between the integer type and a
779  // vector of pointers (e.g. i64 to <2 x i16*>).
780  if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
781  continue;
782 
783  // Skip weird non-byte sizes. They probably aren't worth the effort of
784  // handling correctly.
785  unsigned TySize = DL.getTypeSizeInBits(Ty);
786  if ((TySize % 8) != 0)
787  continue;
788 
789  Value *Ptr = SI->getPointerOperand();
790  unsigned AS = Ptr->getType()->getPointerAddressSpace();
791  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
792 
793  unsigned VF = VecRegSize / TySize;
794  VectorType *VecTy = dyn_cast<VectorType>(Ty);
795 
796  // No point in looking at these if they're too big to vectorize.
797  if (TySize > VecRegSize / 2 ||
798  (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
799  continue;
800 
801  if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
802  const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
803  return EEI && isa<ConstantInt>(EEI->getOperand(1));
804  }))
805  continue;
806 
807  // Save store location.
808  const ChainID ID = getChainID(Ptr, DL);
809  StoreRefs[ID].push_back(SI);
810  }
811  }
812 
813  return {LoadRefs, StoreRefs};
814 }
815 
816 bool Vectorizer::vectorizeChains(InstrListMap &Map) {
817  bool Changed = false;
818 
819  for (const std::pair<ChainID, InstrList> &Chain : Map) {
820  unsigned Size = Chain.second.size();
821  if (Size < 2)
822  continue;
823 
824  LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
825 
826  // Process the instructions in chunks of 64.
827  for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
828  unsigned Len = std::min<unsigned>(CE - CI, 64);
829  ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
830  Changed |= vectorizeInstructions(Chunk);
831  }
832  }
833 
834  return Changed;
835 }
836 
837 bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
838  LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
839  << " instructions.\n");
840  SmallVector<int, 16> Heads, Tails;
841  int ConsecutiveChain[64];
842 
843  // Do a quadratic search on all of the given loads/stores and find all of the
844  // pairs of loads/stores that follow each other.
845  for (int i = 0, e = Instrs.size(); i < e; ++i) {
846  ConsecutiveChain[i] = -1;
847  for (int j = e - 1; j >= 0; --j) {
848  if (i == j)
849  continue;
850 
851  if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
852  if (ConsecutiveChain[i] != -1) {
853  int CurDistance = std::abs(ConsecutiveChain[i] - i);
854  int NewDistance = std::abs(ConsecutiveChain[i] - j);
855  if (j < i || NewDistance > CurDistance)
856  continue; // Should not insert.
857  }
858 
859  Tails.push_back(j);
860  Heads.push_back(i);
861  ConsecutiveChain[i] = j;
862  }
863  }
864  }
865 
866  bool Changed = false;
867  SmallPtrSet<Instruction *, 16> InstructionsProcessed;
868 
869  for (int Head : Heads) {
870  if (InstructionsProcessed.count(Instrs[Head]))
871  continue;
872  bool LongerChainExists = false;
873  for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
874  if (Head == Tails[TIt] &&
875  !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
876  LongerChainExists = true;
877  break;
878  }
879  if (LongerChainExists)
880  continue;
881 
882  // We found an instr that starts a chain. Now follow the chain and try to
883  // vectorize it.
884  SmallVector<Instruction *, 16> Operands;
885  int I = Head;
886  while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
887  if (InstructionsProcessed.count(Instrs[I]))
888  break;
889 
890  Operands.push_back(Instrs[I]);
891  I = ConsecutiveChain[I];
892  }
893 
894  bool Vectorized = false;
895  if (isa<LoadInst>(*Operands.begin()))
896  Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
897  else
898  Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
899 
900  Changed |= Vectorized;
901  }
902 
903  return Changed;
904 }
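// Illustrative walk-through: for Instrs = {A, B, C} with B consecutive to A
// and C consecutive to B, the quadratic search records
// ConsecutiveChain[A] = B and ConsecutiveChain[B] = C, with Heads = {A, B}
// and Tails = {B, C}. B is skipped as a chain start, either because a longer
// chain headed by A still exists or because it was already marked processed
// when A's chain was handled, so the chain is followed from A and {A, B, C}
// is vectorized as one unit.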
905 
906 bool Vectorizer::vectorizeStoreChain(
907  ArrayRef<Instruction *> Chain,
908  SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
909  StoreInst *S0 = cast<StoreInst>(Chain[0]);
910 
911  // If the vector has an int element, default to int for the whole store.
912  Type *StoreTy;
913  for (Instruction *I : Chain) {
914  StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
915  if (StoreTy->isIntOrIntVectorTy())
916  break;
917 
918  if (StoreTy->isPtrOrPtrVectorTy()) {
919  StoreTy = Type::getIntNTy(F.getParent()->getContext(),
920  DL.getTypeSizeInBits(StoreTy));
921  break;
922  }
923  }
924 
925  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
926  unsigned AS = S0->getPointerAddressSpace();
927  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
928  unsigned VF = VecRegSize / Sz;
929  unsigned ChainSize = Chain.size();
930  unsigned Alignment = getAlignment(S0);
931 
932  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
933  InstructionsProcessed->insert(Chain.begin(), Chain.end());
934  return false;
935  }
936 
937  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
938  if (NewChain.empty()) {
939  // No vectorization possible.
940  InstructionsProcessed->insert(Chain.begin(), Chain.end());
941  return false;
942  }
943  if (NewChain.size() == 1) {
944  // Failed after the first instruction. Discard it and try the smaller chain.
945  InstructionsProcessed->insert(NewChain.front());
946  return false;
947  }
948 
949  // Update Chain to the valid vectorizable subchain.
950  Chain = NewChain;
951  ChainSize = Chain.size();
952 
953  // Check if it's legal to vectorize this chain. If not, split the chain and
954  // try again.
955  unsigned EltSzInBytes = Sz / 8;
956  unsigned SzInBytes = EltSzInBytes * ChainSize;
957 
958  VectorType *VecTy;
959  VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
960  if (VecStoreTy)
961  VecTy = VectorType::get(StoreTy->getScalarType(),
962  Chain.size() * VecStoreTy->getNumElements());
963  else
964  VecTy = VectorType::get(StoreTy, Chain.size());
965 
966  // If it's more than the max vector size or the target has a better
967  // vector factor, break it into two pieces.
968  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
969  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
970  LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
971  " Creating two separate arrays.\n");
972  return vectorizeStoreChain(Chain.slice(0, TargetVF),
973  InstructionsProcessed) |
974  vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
975  }
976 
977  LLVM_DEBUG({
978  dbgs() << "LSV: Stores to vectorize:\n";
979  for (Instruction *I : Chain)
980  dbgs() << " " << *I << "\n";
981  });
982 
983  // We won't try again to vectorize the elements of the chain, regardless of
984  // whether we succeed below.
985  InstructionsProcessed->insert(Chain.begin(), Chain.end());
986 
987  // If the store is going to be misaligned, don't vectorize it.
988  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
989  if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
990  auto Chains = splitOddVectorElts(Chain, Sz);
991  return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
992  vectorizeStoreChain(Chains.second, InstructionsProcessed);
993  }
994 
995  unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
996  StackAdjustedAlignment,
997  DL, S0, nullptr, &DT);
998  if (NewAlign != 0)
999  Alignment = NewAlign;
1000  }
1001 
1002  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
1003  auto Chains = splitOddVectorElts(Chain, Sz);
1004  return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
1005  vectorizeStoreChain(Chains.second, InstructionsProcessed);
1006  }
1007 
1008  BasicBlock::iterator First, Last;
1009  std::tie(First, Last) = getBoundaryInstrs(Chain);
1010  Builder.SetInsertPoint(&*Last);
1011 
1012  Value *Vec = UndefValue::get(VecTy);
1013 
1014  if (VecStoreTy) {
1015  unsigned VecWidth = VecStoreTy->getNumElements();
1016  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1017  StoreInst *Store = cast<StoreInst>(Chain[I]);
1018  for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
1019  unsigned NewIdx = J + I * VecWidth;
1020  Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
1021  Builder.getInt32(J));
1022  if (Extract->getType() != StoreTy->getScalarType())
1023  Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
1024 
1025  Value *Insert =
1026  Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
1027  Vec = Insert;
1028  }
1029  }
1030  } else {
1031  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1032  StoreInst *Store = cast<StoreInst>(Chain[I]);
1033  Value *Extract = Store->getValueOperand();
1034  if (Extract->getType() != StoreTy->getScalarType())
1035  Extract =
1036  Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
1037 
1038  Value *Insert =
1039  Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
1040  Vec = Insert;
1041  }
1042  }
1043 
1044  StoreInst *SI = Builder.CreateAlignedStore(
1045  Vec,
1046  Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
1047  Alignment);
1048  propagateMetadata(SI, Chain);
1049 
1050  eraseInstructions(Chain);
1051  ++NumVectorInstructions;
1052  NumScalarsVectorized += Chain.size();
1053  return true;
1054 }
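// Illustrative example: a chain of two stores of <2 x i16> is emitted as one
// store of <4 x i16>; the loops above extract the two lanes of each stored
// value and re-insert them at lanes 0..3 of the wide vector before the final
// aligned store.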
1055 
1056 bool Vectorizer::vectorizeLoadChain(
1057  ArrayRef<Instruction *> Chain,
1058  SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
1059  LoadInst *L0 = cast<LoadInst>(Chain[0]);
1060 
1061  // If the vector has an int element, default to int for the whole load.
1062  Type *LoadTy;
1063  for (const auto &V : Chain) {
1064  LoadTy = cast<LoadInst>(V)->getType();
1065  if (LoadTy->isIntOrIntVectorTy())
1066  break;
1067 
1068  if (LoadTy->isPtrOrPtrVectorTy()) {
1069  LoadTy = Type::getIntNTy(F.getParent()->getContext(),
1070  DL.getTypeSizeInBits(LoadTy));
1071  break;
1072  }
1073  }
1074 
1075  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
1076  unsigned AS = L0->getPointerAddressSpace();
1077  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1078  unsigned VF = VecRegSize / Sz;
1079  unsigned ChainSize = Chain.size();
1080  unsigned Alignment = getAlignment(L0);
1081 
1082  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
1083  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1084  return false;
1085  }
1086 
1087  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
1088  if (NewChain.empty()) {
1089  // No vectorization possible.
1090  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1091  return false;
1092  }
1093  if (NewChain.size() == 1) {
1094  // Failed after the first instruction. Discard it and try the smaller chain.
1095  InstructionsProcessed->insert(NewChain.front());
1096  return false;
1097  }
1098 
1099  // Update Chain to the valid vectorizable subchain.
1100  Chain = NewChain;
1101  ChainSize = Chain.size();
1102 
1103  // Check if it's legal to vectorize this chain. If not, split the chain and
1104  // try again.
1105  unsigned EltSzInBytes = Sz / 8;
1106  unsigned SzInBytes = EltSzInBytes * ChainSize;
1107  VectorType *VecTy;
1108  VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
1109  if (VecLoadTy)
1110  VecTy = VectorType::get(LoadTy->getScalarType(),
1111  Chain.size() * VecLoadTy->getNumElements());
1112  else
1113  VecTy = VectorType::get(LoadTy, Chain.size());
1114 
1115  // If it's more than the max vector size or the target has a better
1116  // vector factor, break it into two pieces.
1117  unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
1118  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
1119  LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
1120  " Creating two separate arrays.\n");
1121  return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
1122  vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
1123  }
1124 
1125  // We won't try again to vectorize the elements of the chain, regardless of
1126  // whether we succeed below.
1127  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1128 
1129  // If the load is going to be misaligned, don't vectorize it.
1130  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1131  if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1132  auto Chains = splitOddVectorElts(Chain, Sz);
1133  return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1134  vectorizeLoadChain(Chains.second, InstructionsProcessed);
1135  }
1136 
1137  unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
1138  StackAdjustedAlignment,
1139  DL, L0, nullptr, &DT);
1140  if (NewAlign != 0)
1141  Alignment = NewAlign;
1142 
1144  }
1145 
1146  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
1147  auto Chains = splitOddVectorElts(Chain, Sz);
1148  return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1149  vectorizeLoadChain(Chains.second, InstructionsProcessed);
1150  }
1151 
1152  LLVM_DEBUG({
1153  dbgs() << "LSV: Loads to vectorize:\n";
1154  for (Instruction *I : Chain)
1155  I->dump();
1156  });
1157 
1158  // getVectorizablePrefix already computed getBoundaryInstrs. The value of
1159  // Last may have changed since then, but the value of First won't have. If it
1160  // matters, we could compute getBoundaryInstrs only once and reuse it here.
1161  BasicBlock::iterator First, Last;
1162  std::tie(First, Last) = getBoundaryInstrs(Chain);
1163  Builder.SetInsertPoint(&*First);
1164 
1165  Value *Bitcast =
1166  Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
1167  LoadInst *LI = Builder.CreateAlignedLoad(Bitcast, Alignment);
1168  propagateMetadata(LI, Chain);
1169 
1170  if (VecLoadTy) {
1171  SmallVector<Instruction *, 16> InstrsToErase;
1172 
1173  unsigned VecWidth = VecLoadTy->getNumElements();
1174  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1175  for (auto Use : Chain[I]->users()) {
1176  // All users of vector loads are ExtractElement instructions with
1177  // constant indices, otherwise we would have bailed before now.
1178  Instruction *UI = cast<Instruction>(Use);
1179  unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
1180  unsigned NewIdx = Idx + I * VecWidth;
1181  Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
1182  UI->getName());
1183  if (V->getType() != UI->getType())
1184  V = Builder.CreateBitCast(V, UI->getType());
1185 
1186  // Replace the old instruction.
1187  UI->replaceAllUsesWith(V);
1188  InstrsToErase.push_back(UI);
1189  }
1190  }
1191 
1192  // Bitcast might not be an Instruction, if the value being loaded is a
1193  // constant. In that case, no need to reorder anything.
1194  if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1195  reorder(BitcastInst);
1196 
1197  for (auto I : InstrsToErase)
1198  I->eraseFromParent();
1199  } else {
1200  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1201  Value *CV = Chain[I];
1202  Value *V =
1203  Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
1204  if (V->getType() != CV->getType()) {
1205  V = Builder.CreateBitOrPointerCast(V, CV->getType());
1206  }
1207 
1208  // Replace the old instruction.
1209  CV->replaceAllUsesWith(V);
1210  }
1211 
1212  if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1213  reorder(BitcastInst);
1214  }
1215 
1216  eraseInstructions(Chain);
1217 
1218  ++NumVectorInstructions;
1219  NumScalarsVectorized += Chain.size();
1220  return true;
1221 }
1222 
1223 bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
1224  unsigned Alignment) {
1225  if (Alignment % SzInBytes == 0)
1226  return false;
1227 
1228  bool Fast = false;
1229  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
1230  SzInBytes * 8, AddressSpace,
1231  Alignment, &Fast);
1232  LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
1233  << " and fast? " << Fast << "\n";);
1234  return !Allows || !Fast;
1235 }