LLVM 9.0.0svn
LoadStoreVectorizer.cpp
1 //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass merges loads/stores to/from sequential memory addresses into vector
10 // loads/stores. Although there's nothing GPU-specific in here, this pass is
11 // motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12 //
13 // (For simplicity below we talk about loads only, but everything also applies
14 // to stores.)
15 //
16 // This pass is intended to be run late in the pipeline, after other
17 // vectorization opportunities have been exploited. So the assumption here is
18 // that immediately following our new vector load we'll need to extract out the
19 // individual elements of the load, so we can operate on them individually.
20 //
21 // On CPUs this transformation is usually not beneficial, because extracting the
22 // elements of a vector register is expensive on most architectures. It's
23 // usually better just to load each element individually into its own scalar
24 // register.
25 //
26 // However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27 // "vector load" loads directly into a series of scalar registers. In effect,
28 // extracting the elements of the vector is free. It's therefore always
29 // beneficial to vectorize a sequence of loads on these architectures.
30 //
31 // Vectorizing (perhaps a better name might be "coalescing") loads can have
32 // large performance impacts on GPU kernels, and opportunities for vectorizing
33 // are common in GPU code. This pass tries very hard to find such
34 // opportunities; its runtime is quadratic in the number of loads in a BB.
35 //
36 // Some CPU architectures, such as ARM, have instructions that load into
37 // multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38 // could use this pass (with some modifications), but currently it implements
39 // its own pass to do something similar to what we do here.
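//
// For example (illustrative IR, not taken from an actual test case), four
// consecutive i32 loads such as
//
//   %p1 = getelementptr inbounds i32, i32* %p, i64 1
//   %p2 = getelementptr inbounds i32, i32* %p, i64 2
//   %p3 = getelementptr inbounds i32, i32* %p, i64 3
//   %v0 = load i32, i32* %p, align 16
//   %v1 = load i32, i32* %p1, align 4
//   %v2 = load i32, i32* %p2, align 8
//   %v3 = load i32, i32* %p3, align 4
//
// are rewritten into a single vector load followed by per-element extracts:
//
//   %bc  = bitcast i32* %p to <4 x i32>*
//   %vec = load <4 x i32>, <4 x i32>* %bc, align 16
//   %v0  = extractelement <4 x i32> %vec, i32 0
//   %v1  = extractelement <4 x i32> %vec, i32 1
//   %v2  = extractelement <4 x i32> %vec, i32 2
//   %v3  = extractelement <4 x i32> %vec, i32 3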
40 
41 #include "llvm/ADT/APInt.h"
42 #include "llvm/ADT/ArrayRef.h"
43 #include "llvm/ADT/MapVector.h"
44 #include "llvm/ADT/PostOrderIterator.h"
45 #include "llvm/ADT/STLExtras.h"
46 #include "llvm/ADT/SmallPtrSet.h"
47 #include "llvm/ADT/SmallVector.h"
48 #include "llvm/ADT/Statistic.h"
58 #include "llvm/IR/Attributes.h"
59 #include "llvm/IR/BasicBlock.h"
60 #include "llvm/IR/Constants.h"
61 #include "llvm/IR/DataLayout.h"
62 #include "llvm/IR/DerivedTypes.h"
63 #include "llvm/IR/Dominators.h"
64 #include "llvm/IR/Function.h"
65 #include "llvm/IR/IRBuilder.h"
66 #include "llvm/IR/InstrTypes.h"
67 #include "llvm/IR/Instruction.h"
68 #include "llvm/IR/Instructions.h"
69 #include "llvm/IR/IntrinsicInst.h"
70 #include "llvm/IR/Module.h"
71 #include "llvm/IR/Type.h"
72 #include "llvm/IR/User.h"
73 #include "llvm/IR/Value.h"
74 #include "llvm/Pass.h"
75 #include "llvm/Support/Casting.h"
76 #include "llvm/Support/Debug.h"
77 #include "llvm/Support/KnownBits.h"
82 #include <algorithm>
83 #include <cassert>
84 #include <cstdlib>
85 #include <tuple>
86 #include <utility>
87 
88 using namespace llvm;
89 
90 #define DEBUG_TYPE "load-store-vectorizer"
91 
92 STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
93 STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
94 
95 // FIXME: Assuming stack alignment of 4 is always good enough
96 static const unsigned StackAdjustedAlignment = 4;
97 
98 namespace {
99 
100 /// ChainID is an arbitrary token that is allowed to be different only for the
101 /// accesses that are guaranteed to be considered non-consecutive by
102 /// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
103 /// together and reducing the number of instructions the main search operates on
104 /// at a time; i.e., it exists only to reduce compile time, since the main
105 /// search has O(n^2) time complexity. The underlying type of ChainID should not
106 /// be relied upon.
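/// For example (illustrative): accesses based on two unrelated allocas get two
/// different ChainIDs and are never compared against each other, while accesses
/// that getChainID maps to the same underlying object (or to the same select
/// condition) share a ChainID and are searched together.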
107 using ChainID = const Value *;
108 using InstrList = SmallVector<Instruction *, 8>;
109 using InstrListMap = MapVector<ChainID, InstrList>;
110 
111 class Vectorizer {
112  Function &F;
113  AliasAnalysis &AA;
114  DominatorTree &DT;
115  ScalarEvolution &SE;
116  TargetTransformInfo &TTI;
117  const DataLayout &DL;
118  IRBuilder<> Builder;
119 
120 public:
121  Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
122  ScalarEvolution &SE, TargetTransformInfo &TTI)
123  : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
124  DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
125 
126  bool run();
127 
128 private:
129  unsigned getPointerAddressSpace(Value *I);
130 
131  unsigned getAlignment(LoadInst *LI) const {
132  unsigned Align = LI->getAlignment();
133  if (Align != 0)
134  return Align;
135 
136  return DL.getABITypeAlignment(LI->getType());
137  }
138 
139  unsigned getAlignment(StoreInst *SI) const {
140  unsigned Align = SI->getAlignment();
141  if (Align != 0)
142  return Align;
143 
144  return DL.getABITypeAlignment(SI->getValueOperand()->getType());
145  }
146 
147  static const unsigned MaxDepth = 3;
148 
149  bool isConsecutiveAccess(Value *A, Value *B);
150  bool areConsecutivePointers(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
151  unsigned Depth = 0) const;
152  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
153  unsigned Depth) const;
154  bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
155  unsigned Depth) const;
156 
157  /// After vectorization, reorder the instructions that I depends on
158  /// (the instructions defining its operands), to ensure they dominate I.
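 /// (The vectorized access is created at the position of the first chain
 /// element in block order, so operand definitions that originally appeared
 /// later in the block may need to be hoisted above it; see vectorizeLoadChain.)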
159  void reorder(Instruction *I);
160 
161  /// Returns the first and the last instructions in Chain.
162  std::pair<BasicBlock::iterator, BasicBlock::iterator>
163  getBoundaryInstrs(ArrayRef<Instruction *> Chain);
164 
165  /// Erases the original instructions after vectorizing.
166  void eraseInstructions(ArrayRef<Instruction *> Chain);
167 
168  /// "Legalize" the vector type that would be produced by combining \p
169  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
170  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
171  /// expected to have more than 4 elements.
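 /// For example (illustrative): a chain of 7 i16 accesses (14 bytes total) is
 /// split into a 6-element piece (12 bytes) and a 1-element remainder.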
172  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
173  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
174 
175  /// Finds the largest prefix of Chain that's vectorizable, checking for
176  /// intervening instructions which may affect the memory accessed by the
177  /// instructions within Chain.
178  ///
179  /// The elements of \p Chain must be all loads or all stores and must be in
180  /// address order.
181  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
182 
183  /// Collects load and store instructions to vectorize.
184  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
185 
186  /// Processes the collected instructions, the \p Map. The values of \p Map
187  /// should be all loads or all stores.
188  bool vectorizeChains(InstrListMap &Map);
189 
190  /// Finds loads/stores to consecutive memory addresses and vectorizes them.
191  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
192 
193  /// Vectorizes the load instructions in Chain.
194  bool
195  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
196  SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
197 
198  /// Vectorizes the store instructions in Chain.
199  bool
200  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
201  SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
202 
203  /// Checks whether a load/store access of the given size and alignment is misaligned.
204  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
205  unsigned Alignment);
206 };
207 
208 class LoadStoreVectorizerLegacyPass : public FunctionPass {
209 public:
210  static char ID;
211 
212  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
213  initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
214  }
215 
216  bool runOnFunction(Function &F) override;
217 
218  StringRef getPassName() const override {
219  return "GPU Load and Store Vectorizer";
220  }
221 
222  void getAnalysisUsage(AnalysisUsage &AU) const override {
223  AU.addRequired<AAResultsWrapperPass>();
224  AU.addRequired<ScalarEvolutionWrapperPass>();
225  AU.addRequired<DominatorTreeWrapperPass>();
226  AU.addRequired<TargetTransformInfoWrapperPass>();
227  AU.setPreservesCFG();
228  }
229 };
230 
231 } // end anonymous namespace
232 
233 char LoadStoreVectorizerLegacyPass::ID = 0;
234 
235 INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
236  "Vectorize load and store instructions", false, false)
237 INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
238 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
239 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
240 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
241 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
242 INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
243  "Vectorize load and store instructions", false, false)
244 
245 Pass *llvm::createLoadStoreVectorizerPass() {
246  return new LoadStoreVectorizerLegacyPass();
247 }
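// Typical usage (illustrative): a target adds this pass near the end of its IR
// optimization pipeline, e.g.
//   PM.add(createLoadStoreVectorizerPass());
// and it can also be exercised in isolation with the legacy pass manager via
//   opt -load-store-vectorizer -S input.ll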
248 
249 bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
250  // Don't vectorize when the attribute NoImplicitFloat is used.
251  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
252  return false;
253 
254  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
255  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
256  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
257  TargetTransformInfo &TTI =
258  getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
259 
260  Vectorizer V(F, AA, DT, SE, TTI);
261  return V.run();
262 }
263 
264 PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
265  // Don't vectorize when the attribute NoImplicitFloat is used.
266  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
267  return PreservedAnalyses::all();
268 
269  AliasAnalysis &AA = AM.getResult<AAManager>(F);
270  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
271  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
272  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
273 
274  Vectorizer V(F, AA, DT, SE, TTI);
275  bool Changed = V.run();
276  PreservedAnalyses PA;
277  PA.preserveSet<CFGAnalyses>();
278  return Changed ? PA : PreservedAnalyses::all();
279 }
280 
281 // The real propagateMetadata expects a SmallVector<Value*>, but we deal in
282 // vectors of Instructions.
283 static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
284  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
285  propagateMetadata(I, VL);
286 }
287 
288 // Vectorizer Implementation
289 bool Vectorizer::run() {
290  bool Changed = false;
291 
292  // Scan the blocks in the function in post order.
293  for (BasicBlock *BB : post_order(&F)) {
294  InstrListMap LoadRefs, StoreRefs;
295  std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
296  Changed |= vectorizeChains(LoadRefs);
297  Changed |= vectorizeChains(StoreRefs);
298  }
299 
300  return Changed;
301 }
302 
303 unsigned Vectorizer::getPointerAddressSpace(Value *I) {
304  if (LoadInst *L = dyn_cast<LoadInst>(I))
305  return L->getPointerAddressSpace();
306  if (StoreInst *S = dyn_cast<StoreInst>(I))
307  return S->getPointerAddressSpace();
308  return -1;
309 }
310 
311 // FIXME: Merge with llvm::isConsecutiveAccess
312 bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
313  Value *PtrA = getLoadStorePointerOperand(A);
314  Value *PtrB = getLoadStorePointerOperand(B);
315  unsigned ASA = getPointerAddressSpace(A);
316  unsigned ASB = getPointerAddressSpace(B);
317 
318  // Check that the address spaces match and that the pointers are valid.
319  if (!PtrA || !PtrB || (ASA != ASB))
320  return false;
321 
322  // Make sure that A and B are different pointers to types of the same size.
323  Type *PtrATy = PtrA->getType()->getPointerElementType();
324  Type *PtrBTy = PtrB->getType()->getPointerElementType();
325  if (PtrA == PtrB ||
326  PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
327  DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
328  DL.getTypeStoreSize(PtrATy->getScalarType()) !=
329  DL.getTypeStoreSize(PtrBTy->getScalarType()))
330  return false;
331 
332  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
333  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
334 
335  return areConsecutivePointers(PtrA, PtrB, Size);
336 }
337 
338 bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
339  const APInt &PtrDelta,
340  unsigned Depth) const {
341  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
342  APInt OffsetA(PtrBitWidth, 0);
343  APInt OffsetB(PtrBitWidth, 0);
344  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
345  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
346 
347  APInt OffsetDelta = OffsetB - OffsetA;
348 
349  // Check if they are based on the same pointer. That makes the offsets
350  // sufficient.
351  if (PtrA == PtrB)
352  return OffsetDelta == PtrDelta;
353 
354  // Compute the base pointer delta needed to make the final delta equal to the
355  // pointer delta requested.
356  APInt BaseDelta = PtrDelta - OffsetDelta;
357 
358  // Compute the distance with SCEV between the base pointers.
359  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
360  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
361  const SCEV *C = SE.getConstant(BaseDelta);
362  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
363  if (X == PtrSCEVB)
364  return true;
365 
366  // The above check will not catch the cases where one of the pointers is
367  // factorized but the other one is not, such as (C + (S * (A + B))) vs
368  // (AS + BS). Get the subtraction SCEV; that will allow re-combining the
369  // expressions and getting the simplified difference.
370  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
371  if (C == Dist)
372  return true;
373 
374  // Sometimes even this doesn't work, because SCEV can't always see through
375  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
376  // things the hard way.
377  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
378 }
379 
380 bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
381  APInt PtrDelta,
382  unsigned Depth) const {
383  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
384  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
385  if (!GEPA || !GEPB)
386  return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
387 
388  // Look through GEPs after checking they're the same except for the last
389  // index.
390  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
391  GEPA->getPointerOperand() != GEPB->getPointerOperand())
392  return false;
393  gep_type_iterator GTIA = gep_type_begin(GEPA);
394  gep_type_iterator GTIB = gep_type_begin(GEPB);
395  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
396  if (GTIA.getOperand() != GTIB.getOperand())
397  return false;
398  ++GTIA;
399  ++GTIB;
400  }
401 
402  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
403  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
404  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
405  OpA->getType() != OpB->getType())
406  return false;
407 
408  if (PtrDelta.isNegative()) {
409  if (PtrDelta.isMinSignedValue())
410  return false;
411  PtrDelta.negate();
412  std::swap(OpA, OpB);
413  }
414  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
415  if (PtrDelta.urem(Stride) != 0)
416  return false;
417  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
418  APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
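  // For instance (illustrative): with an i32 indexed type (Stride == 4), a
  // PtrDelta of 8 bytes gives IdxDiff == 2, i.e. the GEPs' last indices must
  // differ by exactly two elements.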
419 
420  // Only look through a ZExt/SExt.
421  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
422  return false;
423 
424  bool Signed = isa<SExtInst>(OpA);
425 
426  // At this point A could be a function parameter, i.e. not an instruction
427  Value *ValA = OpA->getOperand(0);
428  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
429  if (!OpB || ValA->getType() != OpB->getType())
430  return false;
431 
432  // Now we need to prove that adding IdxDiff to ValA won't overflow.
433  bool Safe = false;
434  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
435  // ValA, we're okay.
436  if (OpB->getOpcode() == Instruction::Add &&
437  isa<ConstantInt>(OpB->getOperand(1)) &&
438  IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
439  if (Signed)
440  Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
441  else
442  Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
443  }
444 
445  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
446 
447  // Second attempt:
448  // If all the bits that are set in IdxDiff, and any higher-order bits other
449  // than the sign bit, are known to be zero in ValA, we can add IdxDiff to it
450  // while guaranteeing no overflow of any sort.
451  if (!Safe) {
452  OpA = dyn_cast<Instruction>(ValA);
453  if (!OpA)
454  return false;
455  KnownBits Known(BitWidth);
456  computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
457  APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
458  if (Signed)
459  BitsAllowedToBeSet.clearBit(BitWidth - 1);
460  if (BitsAllowedToBeSet.ult(IdxDiff))
461  return false;
462  }
463 
464  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
465  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
466  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
467  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
468  return X == OffsetSCEVB;
469 }
470 
471 bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
472  const APInt &PtrDelta,
473  unsigned Depth) const {
474  if (Depth++ == MaxDepth)
475  return false;
476 
477  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
478  if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
479  return SelectA->getCondition() == SelectB->getCondition() &&
480  areConsecutivePointers(SelectA->getTrueValue(),
481  SelectB->getTrueValue(), PtrDelta, Depth) &&
482  areConsecutivePointers(SelectA->getFalseValue(),
483  SelectB->getFalseValue(), PtrDelta, Depth);
484  }
485  }
486  return false;
487 }
488 
489 void Vectorizer::reorder(Instruction *I) {
490  OrderedBasicBlock OBB(I->getParent());
491  SmallPtrSet<Instruction *, 16> InstructionsToMove;
492  SmallVector<Instruction *, 16> Worklist;
493 
494  Worklist.push_back(I);
495  while (!Worklist.empty()) {
496  Instruction *IW = Worklist.pop_back_val();
497  int NumOperands = IW->getNumOperands();
498  for (int i = 0; i < NumOperands; i++) {
499  Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
500  if (!IM || IM->getOpcode() == Instruction::PHI)
501  continue;
502 
503  // If IM is in another BB, no need to move it, because this pass only
504  // vectorizes instructions within one BB.
505  if (IM->getParent() != I->getParent())
506  continue;
507 
508  if (!OBB.dominates(IM, I)) {
509  InstructionsToMove.insert(IM);
510  Worklist.push_back(IM);
511  }
512  }
513  }
514 
515  // All instructions to move should follow I. Start from I, not from begin().
516  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
517  ++BBI) {
518  if (!InstructionsToMove.count(&*BBI))
519  continue;
520  Instruction *IM = &*BBI;
521  --BBI;
522  IM->removeFromParent();
523  IM->insertBefore(I);
524  }
525 }
526 
527 std::pair<BasicBlock::iterator, BasicBlock::iterator>
528 Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
529  Instruction *C0 = Chain[0];
530  BasicBlock::iterator FirstInstr = C0->getIterator();
531  BasicBlock::iterator LastInstr = C0->getIterator();
532 
533  BasicBlock *BB = C0->getParent();
534  unsigned NumFound = 0;
535  for (Instruction &I : *BB) {
536  if (!is_contained(Chain, &I))
537  continue;
538 
539  ++NumFound;
540  if (NumFound == 1) {
541  FirstInstr = I.getIterator();
542  }
543  if (NumFound == Chain.size()) {
544  LastInstr = I.getIterator();
545  break;
546  }
547  }
548 
549  // Range is [first, last).
550  return std::make_pair(FirstInstr, ++LastInstr);
551 }
552 
553 void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
554  SmallVector<Instruction *, 16> Instrs;
555  for (Instruction *I : Chain) {
556  Value *PtrOperand = getLoadStorePointerOperand(I);
557  assert(PtrOperand && "Instruction must have a pointer operand.");
558  Instrs.push_back(I);
559  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
560  Instrs.push_back(GEP);
561  }
562 
563  // Erase instructions.
564  for (Instruction *I : Instrs)
565  if (I->use_empty())
566  I->eraseFromParent();
567 }
568 
569 std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
570 Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
571  unsigned ElementSizeBits) {
572  unsigned ElementSizeBytes = ElementSizeBits / 8;
573  unsigned SizeBytes = ElementSizeBytes * Chain.size();
574  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
575  if (NumLeft == Chain.size()) {
576  if ((NumLeft & 1) == 0)
577  NumLeft /= 2; // Split even in half
578  else
579  --NumLeft; // Split off last element
580  } else if (NumLeft == 0)
581  NumLeft = 1;
582  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
583 }
584 
585 ArrayRef<Instruction *>
586 Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
587  // These are in BB order, unlike Chain, which is in address order.
588  SmallVector<Instruction *, 16> MemoryInstrs;
589  SmallVector<Instruction *, 16> ChainInstrs;
590 
591  bool IsLoadChain = isa<LoadInst>(Chain[0]);
592  LLVM_DEBUG({
593  for (Instruction *I : Chain) {
594  if (IsLoadChain)
595  assert(isa<LoadInst>(I) &&
596  "All elements of Chain must be loads, or all must be stores.");
597  else
598  assert(isa<StoreInst>(I) &&
599  "All elements of Chain must be loads, or all must be stores.");
600  }
601  });
602 
603  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
604  if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
605  if (!is_contained(Chain, &I))
606  MemoryInstrs.push_back(&I);
607  else
608  ChainInstrs.push_back(&I);
609  } else if (isa<IntrinsicInst>(&I) &&
610  cast<IntrinsicInst>(&I)->getIntrinsicID() ==
611  Intrinsic::sideeffect) {
612  // Ignore llvm.sideeffect calls.
613  } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
614  LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
615  << '\n');
616  break;
617  } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
618  LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
619  << '\n');
620  break;
621  }
622  }
623 
624  OrderedBasicBlock OBB(Chain[0]->getParent());
625 
626  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
627  unsigned ChainInstrIdx = 0;
628  Instruction *BarrierMemoryInstr = nullptr;
629 
630  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
631  Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
632 
633  // If a barrier memory instruction was found, chain instructions that follow
634  // will not be added to the valid prefix.
635  if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
636  break;
637 
638  // Check (in BB order) if any instruction prevents ChainInstr from being
639  // vectorized. Find and store the first such "conflicting" instruction.
640  for (Instruction *MemInstr : MemoryInstrs) {
641  // If a barrier memory instruction was found, do not check past it.
642  if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
643  break;
644 
645  auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
646  auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
647  if (MemLoad && ChainLoad)
648  continue;
649 
650  // We can ignore the alias if we have a load/store pair and the load is
651  // known to be invariant. The load cannot be clobbered by the store.
652  auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
653  return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr;
654  };
655 
656  // We can ignore the alias as long as the load comes before the store,
657  // because that means we won't be moving the load past the store to
658  // vectorize it (the vectorized load is inserted at the location of the
659  // first load in the chain).
660  if (isa<StoreInst>(MemInstr) && ChainLoad &&
661  (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))
662  continue;
663 
664  // Same case, but in reverse.
665  if (MemLoad && isa<StoreInst>(ChainInstr) &&
666  (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))
667  continue;
668 
669  if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
670  MemoryLocation::get(ChainInstr))) {
671  LLVM_DEBUG({
672  dbgs() << "LSV: Found alias:\n"
673  " Aliasing instruction and pointer:\n"
674  << " " << *MemInstr << '\n'
675  << " " << *getLoadStorePointerOperand(MemInstr) << '\n'
676  << " Aliased instruction and pointer:\n"
677  << " " << *ChainInstr << '\n'
678  << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
679  });
680  // Save this aliasing memory instruction as a barrier, but allow other
681  // instructions that precede the barrier to be vectorized with this one.
682  BarrierMemoryInstr = MemInstr;
683  break;
684  }
685  }
686  // Continue the search only for store chains, since vectorizing stores that
687  // precede an aliasing load is valid. Conversely, vectorizing loads is valid
688  // up to an aliasing store, but should not pull loads from further down in
689  // the basic block.
690  if (IsLoadChain && BarrierMemoryInstr) {
691  // The BarrierMemoryInstr is a store that precedes ChainInstr.
692  assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
693  break;
694  }
695  }
696 
697  // Find the largest prefix of Chain whose elements are all in
698  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
699  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
700  // order.)
701  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
702  ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
703  unsigned ChainIdx = 0;
704  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
705  if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
706  break;
707  }
708  return Chain.slice(0, ChainIdx);
709 }
710 
711 static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {
712  const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
713  if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
714  // The selects themselves are distinct instructions even if they share the
715  // same condition and evaluate to consecutive pointers for the true and false
716  // values of the condition. Therefore, using the selects themselves for
717  // grouping instructions would put consecutive accesses into different lists;
718  // they would never even be checked for being consecutive, and would not be
719  // vectorized.
720  return Sel->getCondition();
721  }
722  return ObjPtr;
723 }
724 
725 std::pair<InstrListMap, InstrListMap>
726 Vectorizer::collectInstructions(BasicBlock *BB) {
727  InstrListMap LoadRefs;
728  InstrListMap StoreRefs;
729 
730  for (Instruction &I : *BB) {
731  if (!I.mayReadOrWriteMemory())
732  continue;
733 
734  if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
735  if (!LI->isSimple())
736  continue;
737 
738  // Skip if it's not legal.
739  if (!TTI.isLegalToVectorizeLoad(LI))
740  continue;
741 
742  Type *Ty = LI->getType();
743  if (!VectorType::isValidElementType(Ty->getScalarType()))
744  continue;
745 
746  // Skip weird non-byte sizes. They probably aren't worth the effort of
747  // handling correctly.
748  unsigned TySize = DL.getTypeSizeInBits(Ty);
749  if ((TySize % 8) != 0)
750  continue;
751 
752  // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
753  // functions currently use an integer type for the vectorized
754  // load/store, and do not support casting between the integer type and a
755  // vector of pointers (e.g. i64 to <2 x i16*>).
756  if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
757  continue;
758 
759  Value *Ptr = LI->getPointerOperand();
760  unsigned AS = Ptr->getType()->getPointerAddressSpace();
761  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
762 
763  unsigned VF = VecRegSize / TySize;
764  VectorType *VecTy = dyn_cast<VectorType>(Ty);
765 
766  // No point in looking at these if they're too big to vectorize.
767  if (TySize > VecRegSize / 2 ||
768  (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
769  continue;
770 
771  // Make sure all the users of a vector are constant-index extracts.
772  if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
773  const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
774  return EEI && isa<ConstantInt>(EEI->getOperand(1));
775  }))
776  continue;
777 
778  // Save the load locations.
779  const ChainID ID = getChainID(Ptr, DL);
780  LoadRefs[ID].push_back(LI);
781  } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
782  if (!SI->isSimple())
783  continue;
784 
785  // Skip if it's not legal.
786  if (!TTI.isLegalToVectorizeStore(SI))
787  continue;
788 
789  Type *Ty = SI->getValueOperand()->getType();
790  if (!VectorType::isValidElementType(Ty->getScalarType()))
791  continue;
792 
793  // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
794  // functions currently use an integer type for the vectorized
795  // load/store, and do not support casting between the integer type and a
796  // vector of pointers (e.g. i64 to <2 x i16*>).
797  if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
798  continue;
799 
800  // Skip weird non-byte sizes. They probably aren't worth the effort of
801  // handling correctly.
802  unsigned TySize = DL.getTypeSizeInBits(Ty);
803  if ((TySize % 8) != 0)
804  continue;
805 
806  Value *Ptr = SI->getPointerOperand();
807  unsigned AS = Ptr->getType()->getPointerAddressSpace();
808  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
809 
810  unsigned VF = VecRegSize / TySize;
811  VectorType *VecTy = dyn_cast<VectorType>(Ty);
812 
813  // No point in looking at these if they're too big to vectorize.
814  if (TySize > VecRegSize / 2 ||
815  (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
816  continue;
817 
818  if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
819  const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
820  return EEI && isa<ConstantInt>(EEI->getOperand(1));
821  }))
822  continue;
823 
824  // Save store location.
825  const ChainID ID = getChainID(Ptr, DL);
826  StoreRefs[ID].push_back(SI);
827  }
828  }
829 
830  return {LoadRefs, StoreRefs};
831 }
832 
833 bool Vectorizer::vectorizeChains(InstrListMap &Map) {
834  bool Changed = false;
835 
836  for (const std::pair<ChainID, InstrList> &Chain : Map) {
837  unsigned Size = Chain.second.size();
838  if (Size < 2)
839  continue;
840 
841  LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
842 
843  // Process the instructions in chunks of 64.
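  // (The chunk size of 64 matches the fixed-size ConsecutiveChain array used
  // in vectorizeInstructions.)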
844  for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
845  unsigned Len = std::min<unsigned>(CE - CI, 64);
846  ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
847  Changed |= vectorizeInstructions(Chunk);
848  }
849  }
850 
851  return Changed;
852 }
853 
854 bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
855  LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
856  << " instructions.\n");
857  SmallVector<int, 16> Heads, Tails;
858  int ConsecutiveChain[64];
859 
860  // Do a quadratic search on all of the given loads/stores and find all of the
861  // pairs of loads/stores that follow each other.
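  // (Each recorded pair (Heads[k], Tails[k]) means Instrs[Tails[k]] is the
  // access immediately following Instrs[Heads[k]] in memory, and
  // ConsecutiveChain[i] records the chosen successor of i.)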
862  for (int i = 0, e = Instrs.size(); i < e; ++i) {
863  ConsecutiveChain[i] = -1;
864  for (int j = e - 1; j >= 0; --j) {
865  if (i == j)
866  continue;
867 
868  if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
869  if (ConsecutiveChain[i] != -1) {
870  int CurDistance = std::abs(ConsecutiveChain[i] - i);
871  int NewDistance = std::abs(ConsecutiveChain[i] - j);
872  if (j < i || NewDistance > CurDistance)
873  continue; // Should not insert.
874  }
875 
876  Tails.push_back(j);
877  Heads.push_back(i);
878  ConsecutiveChain[i] = j;
879  }
880  }
881  }
882 
883  bool Changed = false;
884  SmallPtrSet<Instruction *, 16> InstructionsProcessed;
885 
886  for (int Head : Heads) {
887  if (InstructionsProcessed.count(Instrs[Head]))
888  continue;
889  bool LongerChainExists = false;
890  for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
891  if (Head == Tails[TIt] &&
892  !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
893  LongerChainExists = true;
894  break;
895  }
896  if (LongerChainExists)
897  continue;
898 
899  // We found an instr that starts a chain. Now follow the chain and try to
900  // vectorize it.
901  SmallVector<Instruction *, 16> Operands;
902  int I = Head;
903  while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
904  if (InstructionsProcessed.count(Instrs[I]))
905  break;
906 
907  Operands.push_back(Instrs[I]);
908  I = ConsecutiveChain[I];
909  }
910 
911  bool Vectorized = false;
912  if (isa<LoadInst>(*Operands.begin()))
913  Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
914  else
915  Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
916 
917  Changed |= Vectorized;
918  }
919 
920  return Changed;
921 }
922 
923 bool Vectorizer::vectorizeStoreChain(
924  ArrayRef<Instruction *> Chain,
925  SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
926  StoreInst *S0 = cast<StoreInst>(Chain[0]);
927 
928  // If the vector has an int element, default to int for the whole store.
929  Type *StoreTy;
930  for (Instruction *I : Chain) {
931  StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
932  if (StoreTy->isIntOrIntVectorTy())
933  break;
934 
935  if (StoreTy->isPtrOrPtrVectorTy()) {
936  StoreTy = Type::getIntNTy(F.getParent()->getContext(),
937  DL.getTypeSizeInBits(StoreTy));
938  break;
939  }
940  }
941 
942  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
943  unsigned AS = S0->getPointerAddressSpace();
944  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
945  unsigned VF = VecRegSize / Sz;
946  unsigned ChainSize = Chain.size();
947  unsigned Alignment = getAlignment(S0);
948 
949  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
950  InstructionsProcessed->insert(Chain.begin(), Chain.end());
951  return false;
952  }
953 
954  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
955  if (NewChain.empty()) {
956  // No vectorization possible.
957  InstructionsProcessed->insert(Chain.begin(), Chain.end());
958  return false;
959  }
960  if (NewChain.size() == 1) {
961  // Failed after the first instruction. Discard it and try the smaller chain.
962  InstructionsProcessed->insert(NewChain.front());
963  return false;
964  }
965 
966  // Update Chain to the valid vectorizable subchain.
967  Chain = NewChain;
968  ChainSize = Chain.size();
969 
970  // Check if it's legal to vectorize this chain. If not, split the chain and
971  // try again.
972  unsigned EltSzInBytes = Sz / 8;
973  unsigned SzInBytes = EltSzInBytes * ChainSize;
974 
975  VectorType *VecTy;
976  VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
977  if (VecStoreTy)
978  VecTy = VectorType::get(StoreTy->getScalarType(),
979  Chain.size() * VecStoreTy->getNumElements());
980  else
981  VecTy = VectorType::get(StoreTy, Chain.size());
982 
983  // If it's more than the max vector size or the target has a better
984  // vector factor, break it into two pieces.
985  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
986  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
987  LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
988  " Creating two separate arrays.\n");
989  return vectorizeStoreChain(Chain.slice(0, TargetVF),
990  InstructionsProcessed) |
991  vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
992  }
993 
994  LLVM_DEBUG({
995  dbgs() << "LSV: Stores to vectorize:\n";
996  for (Instruction *I : Chain)
997  dbgs() << " " << *I << "\n";
998  });
999 
1000  // We won't try again to vectorize the elements of the chain, regardless of
1001  // whether we succeed below.
1002  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1003 
1004  // If the store is going to be misaligned, don't vectorize it.
1005  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1006  if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1007  auto Chains = splitOddVectorElts(Chain, Sz);
1008  return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
1009  vectorizeStoreChain(Chains.second, InstructionsProcessed);
1010  }
1011 
1012  unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
1013  StackAdjustedAlignment,
1014  DL, S0, nullptr, &DT);
1015  if (NewAlign != 0)
1016  Alignment = NewAlign;
1017  }
1018 
1019  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
1020  auto Chains = splitOddVectorElts(Chain, Sz);
1021  return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
1022  vectorizeStoreChain(Chains.second, InstructionsProcessed);
1023  }
1024 
1025  BasicBlock::iterator First, Last;
1026  std::tie(First, Last) = getBoundaryInstrs(Chain);
1027  Builder.SetInsertPoint(&*Last);
1028 
1029  Value *Vec = UndefValue::get(VecTy);
1030 
1031  if (VecStoreTy) {
1032  unsigned VecWidth = VecStoreTy->getNumElements();
1033  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1034  StoreInst *Store = cast<StoreInst>(Chain[I]);
1035  for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
1036  unsigned NewIdx = J + I * VecWidth;
1037  Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
1038  Builder.getInt32(J));
1039  if (Extract->getType() != StoreTy->getScalarType())
1040  Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
1041 
1042  Value *Insert =
1043  Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
1044  Vec = Insert;
1045  }
1046  }
1047  } else {
1048  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1049  StoreInst *Store = cast<StoreInst>(Chain[I]);
1050  Value *Extract = Store->getValueOperand();
1051  if (Extract->getType() != StoreTy->getScalarType())
1052  Extract =
1053  Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
1054 
1055  Value *Insert =
1056  Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
1057  Vec = Insert;
1058  }
1059  }
1060 
1061  StoreInst *SI = Builder.CreateAlignedStore(
1062  Vec,
1063  Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
1064  Alignment);
1065  propagateMetadata(SI, Chain);
1066 
1067  eraseInstructions(Chain);
1068  ++NumVectorInstructions;
1069  NumScalarsVectorized += Chain.size();
1070  return true;
1071 }
1072 
1073 bool Vectorizer::vectorizeLoadChain(
1074  ArrayRef<Instruction *> Chain,
1075  SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
1076  LoadInst *L0 = cast<LoadInst>(Chain[0]);
1077 
1078  // If the vector has an int element, default to int for the whole load.
1079  Type *LoadTy;
1080  for (const auto &V : Chain) {
1081  LoadTy = cast<LoadInst>(V)->getType();
1082  if (LoadTy->isIntOrIntVectorTy())
1083  break;
1084 
1085  if (LoadTy->isPtrOrPtrVectorTy()) {
1086  LoadTy = Type::getIntNTy(F.getParent()->getContext(),
1087  DL.getTypeSizeInBits(LoadTy));
1088  break;
1089  }
1090  }
1091 
1092  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
1093  unsigned AS = L0->getPointerAddressSpace();
1094  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1095  unsigned VF = VecRegSize / Sz;
1096  unsigned ChainSize = Chain.size();
1097  unsigned Alignment = getAlignment(L0);
1098 
1099  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
1100  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1101  return false;
1102  }
1103 
1104  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
1105  if (NewChain.empty()) {
1106  // No vectorization possible.
1107  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1108  return false;
1109  }
1110  if (NewChain.size() == 1) {
1111  // Failed after the first instruction. Discard it and try the smaller chain.
1112  InstructionsProcessed->insert(NewChain.front());
1113  return false;
1114  }
1115 
1116  // Update Chain to the valid vectorizable subchain.
1117  Chain = NewChain;
1118  ChainSize = Chain.size();
1119 
1120  // Check if it's legal to vectorize this chain. If not, split the chain and
1121  // try again.
1122  unsigned EltSzInBytes = Sz / 8;
1123  unsigned SzInBytes = EltSzInBytes * ChainSize;
1124  VectorType *VecTy;
1125  VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
1126  if (VecLoadTy)
1127  VecTy = VectorType::get(LoadTy->getScalarType(),
1128  Chain.size() * VecLoadTy->getNumElements());
1129  else
1130  VecTy = VectorType::get(LoadTy, Chain.size());
1131 
1132  // If it's more than the max vector size or the target has a better
1133  // vector factor, break it into two pieces.
1134  unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
1135  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
1136  LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
1137  " Creating two separate arrays.\n");
1138  return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
1139  vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
1140  }
1141 
1142  // We won't try again to vectorize the elements of the chain, regardless of
1143  // whether we succeed below.
1144  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1145 
1146  // If the load is going to be misaligned, don't vectorize it.
1147  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1148  if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1149  auto Chains = splitOddVectorElts(Chain, Sz);
1150  return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1151  vectorizeLoadChain(Chains.second, InstructionsProcessed);
1152  }
1153 
1154  unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
1155  StackAdjustedAlignment,
1156  DL, L0, nullptr, &DT);
1157  if (NewAlign != 0)
1158  Alignment = NewAlign;
1161  }
1162 
1163  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
1164  auto Chains = splitOddVectorElts(Chain, Sz);
1165  return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1166  vectorizeLoadChain(Chains.second, InstructionsProcessed);
1167  }
1168 
1169  LLVM_DEBUG({
1170  dbgs() << "LSV: Loads to vectorize:\n";
1171  for (Instruction *I : Chain)
1172  I->dump();
1173  });
1174 
1175  // getVectorizablePrefix already computed getBoundaryInstrs. The value of
1176  // Last may have changed since then, but the value of First won't have. If it
1177  // matters, we could compute getBoundaryInstrs only once and reuse it here.
1178  BasicBlock::iterator First, Last;
1179  std::tie(First, Last) = getBoundaryInstrs(Chain);
1180  Builder.SetInsertPoint(&*First);
1181 
1182  Value *Bitcast =
1183  Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
1184  LoadInst *LI = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);
1185  propagateMetadata(LI, Chain);
1186 
1187  if (VecLoadTy) {
1188  SmallVector<Instruction *, 16> InstrsToErase;
1189 
1190  unsigned VecWidth = VecLoadTy->getNumElements();
1191  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1192  for (auto Use : Chain[I]->users()) {
1193  // All users of vector loads are ExtractElement instructions with
1194  // constant indices, otherwise we would have bailed before now.
1195  Instruction *UI = cast<Instruction>(Use);
1196  unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
1197  unsigned NewIdx = Idx + I * VecWidth;
1198  Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
1199  UI->getName());
1200  if (V->getType() != UI->getType())
1201  V = Builder.CreateBitCast(V, UI->getType());
1202 
1203  // Replace the old instruction.
1204  UI->replaceAllUsesWith(V);
1205  InstrsToErase.push_back(UI);
1206  }
1207  }
1208 
1209  // Bitcast might not be an Instruction, if the value being loaded is a
1210  // constant. In that case, no need to reorder anything.
1211  if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1212  reorder(BitcastInst);
1213 
1214  for (auto I : InstrsToErase)
1215  I->eraseFromParent();
1216  } else {
1217  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1218  Value *CV = Chain[I];
1219  Value *V =
1220  Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
1221  if (V->getType() != CV->getType()) {
1222  V = Builder.CreateBitOrPointerCast(V, CV->getType());
1223  }
1224 
1225  // Replace the old instruction.
1226  CV->replaceAllUsesWith(V);
1227  }
1228 
1229  if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1230  reorder(BitcastInst);
1231  }
1232 
1233  eraseInstructions(Chain);
1234 
1235  ++NumVectorInstructions;
1236  NumScalarsVectorized += Chain.size();
1237  return true;
1238 }
1239 
1240 bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
1241  unsigned Alignment) {
1242  if (Alignment % SzInBytes == 0)
1243  return false;
1244 
1245  bool Fast = false;
1246  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
1247  SzInBytes * 8, AddressSpace,
1248  Alignment, &Fast);
1249  LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
1250  << " and fast? " << Fast << "\n";);
1251  return !Allows || !Fast;
1252 }