LLVM  10.0.0svn
LoadStoreVectorizer.cpp
1 //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass merges loads/stores to/from sequential memory addresses into vector
10 // loads/stores. Although there's nothing GPU-specific in here, this pass is
11 // motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12 //
13 // (For simplicity below we talk about loads only, but everything also applies
14 // to stores.)
15 //
16 // This pass is intended to be run late in the pipeline, after other
17 // vectorization opportunities have been exploited. So the assumption here is
18 // that immediately following our new vector load we'll need to extract out the
19 // individual elements of the load, so we can operate on them individually.
20 //
21 // On CPUs this transformation is usually not beneficial, because extracting the
22 // elements of a vector register is expensive on most architectures. It's
23 // usually better just to load each element individually into its own scalar
24 // register.
25 //
26 // However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27 // "vector load" loads directly into a series of scalar registers. In effect,
28 // extracting the elements of the vector is free. It's therefore always
29 // beneficial to vectorize a sequence of loads on these architectures.
30 //
31 // Vectorizing (perhaps a better name might be "coalescing") loads can have
32 // large performance impacts on GPU kernels, and opportunities for vectorizing
33 // are common in GPU code. This pass tries very hard to find such
34 // opportunities; its runtime is quadratic in the number of loads in a BB.
35 //
36 // Some CPU architectures, such as ARM, have instructions that load into
37 // multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38 // could use this pass (with some modifications), but currently it implements
39 // its own pass to do something similar to what we do here.
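//
// As a rough, illustrative sketch (IR value names invented here, not taken
// from any test in this file): four adjacent i32 loads such as
//
//   %l0 = load i32, i32* %p
//   %q1 = getelementptr inbounds i32, i32* %p, i64 1
//   %l1 = load i32, i32* %q1
//   ... (and likewise for %l2, %l3)
//
// would be replaced by one wide load followed by per-element extracts:
//
//   %vp = bitcast i32* %p to <4 x i32>*
//   %v  = load <4 x i32>, <4 x i32>* %vp, align 4
//   %l0 = extractelement <4 x i32> %v, i32 0
//   %l1 = extractelement <4 x i32> %v, i32 1
//   ...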
40 
41 #include "llvm/ADT/APInt.h"
42 #include "llvm/ADT/ArrayRef.h"
43 #include "llvm/ADT/MapVector.h"
44 #include "llvm/ADT/PostOrderIterator.h"
45 #include "llvm/ADT/STLExtras.h"
46 #include "llvm/ADT/SmallPtrSet.h"
47 #include "llvm/ADT/SmallVector.h"
48 #include "llvm/ADT/Statistic.h"
58 #include "llvm/IR/Attributes.h"
59 #include "llvm/IR/BasicBlock.h"
60 #include "llvm/IR/Constants.h"
61 #include "llvm/IR/DataLayout.h"
62 #include "llvm/IR/DerivedTypes.h"
63 #include "llvm/IR/Dominators.h"
64 #include "llvm/IR/Function.h"
65 #include "llvm/IR/IRBuilder.h"
66 #include "llvm/IR/InstrTypes.h"
67 #include "llvm/IR/Instruction.h"
68 #include "llvm/IR/Instructions.h"
69 #include "llvm/IR/IntrinsicInst.h"
70 #include "llvm/IR/Module.h"
71 #include "llvm/IR/Type.h"
72 #include "llvm/IR/User.h"
73 #include "llvm/IR/Value.h"
74 #include "llvm/Pass.h"
75 #include "llvm/Support/Casting.h"
76 #include "llvm/Support/Debug.h"
77 #include "llvm/Support/KnownBits.h"
82 #include <algorithm>
83 #include <cassert>
84 #include <cstdlib>
85 #include <tuple>
86 #include <utility>
87 
88 using namespace llvm;
89 
90 #define DEBUG_TYPE "load-store-vectorizer"
91 
92 STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
93 STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
94 
95 // FIXME: Assuming stack alignment of 4 is always good enough
96 static const unsigned StackAdjustedAlignment = 4;
97 
98 namespace {
99 
100 /// ChainID is an arbitrary token that is allowed to be different only for the
101 /// accesses that are guaranteed to be considered non-consecutive by
102 /// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
103 /// together and reducing the number of instructions the main search operates on
104 /// at a time, i.e. this is to reduce compile time and nothing else as the main
105 /// search has O(n^2) time complexity. The underlying type of ChainID should not
106 /// be relied upon.
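/// As an example of the intent (illustrative, not a normative rule): loads
/// whose pointers are GEPs off the same underlying object (say, one alloca)
/// share that object as their ChainID, while loads through selects that share
/// a condition use the condition value instead (see getChainID below).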
107 using ChainID = const Value *;
108 using InstrList = SmallVector<Instruction *, 8>;
109 using InstrListMap = MapVector<ChainID, InstrList>;
110 
111 class Vectorizer {
112  Function &F;
113  AliasAnalysis &AA;
114  DominatorTree &DT;
115  ScalarEvolution &SE;
116  TargetTransformInfo &TTI;
117  const DataLayout &DL;
118  IRBuilder<> Builder;
119 
120 public:
121  Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
122  ScalarEvolution &SE, TargetTransformInfo &TTI)
123  : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
124  DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
125 
126  bool run();
127 
128 private:
129  unsigned getPointerAddressSpace(Value *I);
130 
131  unsigned getAlignment(LoadInst *LI) const {
132  unsigned Align = LI->getAlignment();
133  if (Align != 0)
134  return Align;
135 
136  return DL.getABITypeAlignment(LI->getType());
137  }
138 
139  unsigned getAlignment(StoreInst *SI) const {
140  unsigned Align = SI->getAlignment();
141  if (Align != 0)
142  return Align;
143 
144  return DL.getABITypeAlignment(SI->getValueOperand()->getType());
145  }
146 
147  static const unsigned MaxDepth = 3;
148 
149  bool isConsecutiveAccess(Value *A, Value *B);
150  bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
151  unsigned Depth = 0) const;
152  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
153  unsigned Depth) const;
154  bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
155  unsigned Depth) const;
156 
157  /// After vectorization, reorder the instructions that I depends on
158  /// (the instructions defining its operands), to ensure they dominate I.
159  void reorder(Instruction *I);
160 
161  /// Returns the first and the last instructions in Chain.
162  std::pair<BasicBlock::iterator, BasicBlock::iterator>
163  getBoundaryInstrs(ArrayRef<Instruction *> Chain);
164 
165  /// Erases the original instructions after vectorizing.
166  void eraseInstructions(ArrayRef<Instruction *> Chain);
167 
168  /// "Legalize" the vector type that would be produced by combining \p
169  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
170  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
171  /// expected to have more than 4 elements.
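 ///
 /// A rough worked example: a chain of five i16 accesses is 10 bytes, so the
 /// split keeps NumLeft = (10 - 10 % 4) / 2 = 4 elements (8 bytes) in the
 /// first piece and leaves the remaining single element in the second.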
172  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
173  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
174 
175  /// Finds the largest prefix of Chain that's vectorizable, checking for
176  /// intervening instructions which may affect the memory accessed by the
177  /// instructions within Chain.
178  ///
179  /// The elements of \p Chain must be all loads or all stores and must be in
180  /// address order.
181  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
182 
183  /// Collects load and store instructions to vectorize.
184  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
185 
186  /// Processes the collected instructions, the \p Map. The values of \p Map
187  /// should be all loads or all stores.
188  bool vectorizeChains(InstrListMap &Map);
189 
190  /// Finds the load/stores to consecutive memory addresses and vectorizes them.
191  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
192 
193  /// Vectorizes the load instructions in Chain.
194  bool
195  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
196  SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
197 
198  /// Vectorizes the store instructions in Chain.
199  bool
200  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
201  SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
202 
203  /// Checks if this load/store access is misaligned.
204  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
205  unsigned Alignment);
206 };
207 
208 class LoadStoreVectorizerLegacyPass : public FunctionPass {
209 public:
210  static char ID;
211 
212  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
213  initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
214  }
215 
216  bool runOnFunction(Function &F) override;
217 
218  StringRef getPassName() const override {
219  return "GPU Load and Store Vectorizer";
220  }
221 
222  void getAnalysisUsage(AnalysisUsage &AU) const override {
223  AU.addRequired<AAResultsWrapperPass>();
224  AU.addRequired<ScalarEvolutionWrapperPass>();
225  AU.addRequired<DominatorTreeWrapperPass>();
226  AU.addRequired<TargetTransformInfoWrapperPass>();
227  AU.setPreservesCFG();
228  }
229 };
230 
231 } // end anonymous namespace
232 
233 char LoadStoreVectorizerLegacyPass::ID = 0;
234 
235 INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
236  "Vectorize load and Store instructions", false, false)
237 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
238 INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
239 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
240 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
241 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
242 INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
243  "Vectorize load and store instructions", false, false)
244 
245 Pass *llvm::createLoadStoreVectorizerPass() {
246  return new LoadStoreVectorizerLegacyPass();
247 }
248 
249 bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
250  // Don't vectorize when the attribute NoImplicitFloat is used.
251  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
252  return false;
253 
254  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
255  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
256  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
257  TargetTransformInfo &TTI =
258  getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
259 
260  Vectorizer V(F, AA, DT, SE, TTI);
261  return V.run();
262 }
263 
264 PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
265  // Don't vectorize when the attribute NoImplicitFloat is used.
266  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
267  return PreservedAnalyses::all();
268 
269  AliasAnalysis &AA = AM.getResult<AAManager>(F);
270  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
271  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
272  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
273 
274  Vectorizer V(F, AA, DT, SE, TTI);
275  bool Changed = V.run();
276  PreservedAnalyses PA;
277  PA.preserveSet<CFGAnalyses>();
278  return Changed ? PA : PreservedAnalyses::all();
279 }
280 
281 // The real propagateMetadata expects a SmallVector<Value*>, but we deal in
282 // vectors of Instructions.
283 static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
284  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
285  propagateMetadata(I, VL);
286 }
287 
288 // Vectorizer Implementation
289 bool Vectorizer::run() {
290  bool Changed = false;
291 
292  // Scan the blocks in the function in post order.
293  for (BasicBlock *BB : post_order(&F)) {
294  InstrListMap LoadRefs, StoreRefs;
295  std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
296  Changed |= vectorizeChains(LoadRefs);
297  Changed |= vectorizeChains(StoreRefs);
298  }
299 
300  return Changed;
301 }
302 
303 unsigned Vectorizer::getPointerAddressSpace(Value *I) {
304  if (LoadInst *L = dyn_cast<LoadInst>(I))
305  return L->getPointerAddressSpace();
306  if (StoreInst *S = dyn_cast<StoreInst>(I))
307  return S->getPointerAddressSpace();
308  return -1;
309 }
310 
311 // FIXME: Merge with llvm::isConsecutiveAccess
312 bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
313  Value *PtrA = getLoadStorePointerOperand(A);
314  Value *PtrB = getLoadStorePointerOperand(B);
315  unsigned ASA = getPointerAddressSpace(A);
316  unsigned ASB = getPointerAddressSpace(B);
317 
318  // Check that the address spaces match and that the pointers are valid.
319  if (!PtrA || !PtrB || (ASA != ASB))
320  return false;
321 
322  // Make sure that A and B are different pointers of the same size type.
323  Type *PtrATy = PtrA->getType()->getPointerElementType();
324  Type *PtrBTy = PtrB->getType()->getPointerElementType();
325  if (PtrA == PtrB ||
326  PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
327  DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
328  DL.getTypeStoreSize(PtrATy->getScalarType()) !=
329  DL.getTypeStoreSize(PtrBTy->getScalarType()))
330  return false;
331 
332  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
333  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
334 
335  return areConsecutivePointers(PtrA, PtrB, Size);
336 }
337 
338 bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
339  APInt PtrDelta, unsigned Depth) const {
340  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
341  APInt OffsetA(PtrBitWidth, 0);
342  APInt OffsetB(PtrBitWidth, 0);
343  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
344  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
345 
346  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
347 
348  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
349  return false;
350 
351  // In case we have to shrink the pointer,
352  // stripAndAccumulateInBoundsConstantOffsets should properly handle a
353  // possible overflow and the value should fit into the smallest data type
354  // used in the cast/gep chain.
355  assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
356  OffsetB.getMinSignedBits() <= NewPtrBitWidth);
357 
358  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
359  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
360  PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);
361 
362  APInt OffsetDelta = OffsetB - OffsetA;
363 
364  // Check if they are based on the same pointer. That makes the offsets
365  // sufficient.
366  if (PtrA == PtrB)
367  return OffsetDelta == PtrDelta;
368 
369  // Compute the necessary base pointer delta to have the necessary final delta
370  // equal to the pointer delta requested.
371  APInt BaseDelta = PtrDelta - OffsetDelta;
372 
373  // Compute the distance with SCEV between the base pointers.
374  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
375  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
376  const SCEV *C = SE.getConstant(BaseDelta);
377  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
378  if (X == PtrSCEVB)
379  return true;
380 
381  // The above check will not catch the cases where one of the pointers is
382  // factorized but the other one is not, such as (C + (S * (A + B))) vs
383  // (AS + BS). Get the minus SCEV. That will allow re-combining the expressions
384  // and getting the simplified difference.
385  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
386  if (C == Dist)
387  return true;
388 
389  // Sometimes even this doesn't work, because SCEV can't always see through
390  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
391  // things the hard way.
392  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
393 }
394 
395 bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
396  APInt PtrDelta,
397  unsigned Depth) const {
398  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
399  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
400  if (!GEPA || !GEPB)
401  return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
402 
403  // Look through GEPs after checking they're the same except for the last
404  // index.
405  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
406  GEPA->getPointerOperand() != GEPB->getPointerOperand())
407  return false;
408  gep_type_iterator GTIA = gep_type_begin(GEPA);
409  gep_type_iterator GTIB = gep_type_begin(GEPB);
410  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
411  if (GTIA.getOperand() != GTIB.getOperand())
412  return false;
413  ++GTIA;
414  ++GTIB;
415  }
416 
417  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
418  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
419  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
420  OpA->getType() != OpB->getType())
421  return false;
422 
423  if (PtrDelta.isNegative()) {
424  if (PtrDelta.isMinSignedValue())
425  return false;
426  PtrDelta.negate();
427  std::swap(OpA, OpB);
428  }
429  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
430  if (PtrDelta.urem(Stride) != 0)
431  return false;
432  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
433  APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
434 
435  // Only look through a ZExt/SExt.
436  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
437  return false;
438 
439  bool Signed = isa<SExtInst>(OpA);
440 
441  // At this point A could be a function parameter, i.e. not an instruction
442  Value *ValA = OpA->getOperand(0);
443  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
444  if (!OpB || ValA->getType() != OpB->getType())
445  return false;
446 
447  // Now we need to prove that adding IdxDiff to ValA won't overflow.
448  bool Safe = false;
449  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
450  // ValA, we're okay.
451  if (OpB->getOpcode() == Instruction::Add &&
452  isa<ConstantInt>(OpB->getOperand(1)) &&
453  IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
454  if (Signed)
455  Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
456  else
457  Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
458  }
459 
460  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
461 
462  // Second attempt:
463  // If all set bits of IdxDiff or any higher order bit other than the sign bit
464  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
465  // overflow of any sort.
466  if (!Safe) {
467  OpA = dyn_cast<Instruction>(ValA);
468  if (!OpA)
469  return false;
470  KnownBits Known(BitWidth);
471  computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
472  APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
473  if (Signed)
474  BitsAllowedToBeSet.clearBit(BitWidth - 1);
475  if (BitsAllowedToBeSet.ult(IdxDiff))
476  return false;
477  }
478 
479  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
480  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
481  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
482  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
483  return X == OffsetSCEVB;
484 }
485 
486 bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
487  const APInt &PtrDelta,
488  unsigned Depth) const {
489  if (Depth++ == MaxDepth)
490  return false;
491 
492  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
493  if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
494  return SelectA->getCondition() == SelectB->getCondition() &&
495  areConsecutivePointers(SelectA->getTrueValue(),
496  SelectB->getTrueValue(), PtrDelta, Depth) &&
497  areConsecutivePointers(SelectA->getFalseValue(),
498  SelectB->getFalseValue(), PtrDelta, Depth);
499  }
500  }
501  return false;
502 }
503 
504 void Vectorizer::reorder(Instruction *I) {
505  OrderedBasicBlock OBB(I->getParent());
506  SmallPtrSet<Instruction *, 16> InstructionsToMove;
507  SmallVector<Instruction *, 16> Worklist;
508 
509  Worklist.push_back(I);
510  while (!Worklist.empty()) {
511  Instruction *IW = Worklist.pop_back_val();
512  int NumOperands = IW->getNumOperands();
513  for (int i = 0; i < NumOperands; i++) {
514  Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
515  if (!IM || IM->getOpcode() == Instruction::PHI)
516  continue;
517 
518  // If IM is in another BB, no need to move it, because this pass only
519  // vectorizes instructions within one BB.
520  if (IM->getParent() != I->getParent())
521  continue;
522 
523  if (!OBB.dominates(IM, I)) {
524  InstructionsToMove.insert(IM);
525  Worklist.push_back(IM);
526  }
527  }
528  }
529 
530  // All instructions to move should follow I. Start from I, not from begin().
531  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
532  ++BBI) {
533  if (!InstructionsToMove.count(&*BBI))
534  continue;
535  Instruction *IM = &*BBI;
536  --BBI;
537  IM->removeFromParent();
538  IM->insertBefore(I);
539  }
540 }
541 
542 std::pair<BasicBlock::iterator, BasicBlock::iterator>
543 Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
544  Instruction *C0 = Chain[0];
545  BasicBlock::iterator FirstInstr = C0->getIterator();
546  BasicBlock::iterator LastInstr = C0->getIterator();
547 
548  BasicBlock *BB = C0->getParent();
549  unsigned NumFound = 0;
550  for (Instruction &I : *BB) {
551  if (!is_contained(Chain, &I))
552  continue;
553 
554  ++NumFound;
555  if (NumFound == 1) {
556  FirstInstr = I.getIterator();
557  }
558  if (NumFound == Chain.size()) {
559  LastInstr = I.getIterator();
560  break;
561  }
562  }
563 
564  // Range is [first, last).
565  return std::make_pair(FirstInstr, ++LastInstr);
566 }
567 
568 void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
569  SmallVector<Instruction *, 16> Instrs;
570  for (Instruction *I : Chain) {
571  Value *PtrOperand = getLoadStorePointerOperand(I);
572  assert(PtrOperand && "Instruction must have a pointer operand.");
573  Instrs.push_back(I);
574  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
575  Instrs.push_back(GEP);
576  }
577 
578  // Erase instructions.
579  for (Instruction *I : Instrs)
580  if (I->use_empty())
581  I->eraseFromParent();
582 }
583 
584 std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
585 Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
586  unsigned ElementSizeBits) {
587  unsigned ElementSizeBytes = ElementSizeBits / 8;
588  unsigned SizeBytes = ElementSizeBytes * Chain.size();
589  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
590  if (NumLeft == Chain.size()) {
591  if ((NumLeft & 1) == 0)
592  NumLeft /= 2; // Split even in half
593  else
594  --NumLeft; // Split off last element
595  } else if (NumLeft == 0)
596  NumLeft = 1;
597  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
598 }
599 
600 ArrayRef<Instruction *>
601 Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
602  // These are in BB order, unlike Chain, which is in address order.
603  SmallVector<Instruction *, 16> MemoryInstrs;
604  SmallVector<Instruction *, 16> ChainInstrs;
605 
606  bool IsLoadChain = isa<LoadInst>(Chain[0]);
607  LLVM_DEBUG({
608  for (Instruction *I : Chain) {
609  if (IsLoadChain)
610  assert(isa<LoadInst>(I) &&
611  "All elements of Chain must be loads, or all must be stores.");
612  else
613  assert(isa<StoreInst>(I) &&
614  "All elements of Chain must be loads, or all must be stores.");
615  }
616  });
617 
618  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
619  if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
620  if (!is_contained(Chain, &I))
621  MemoryInstrs.push_back(&I);
622  else
623  ChainInstrs.push_back(&I);
624  } else if (isa<IntrinsicInst>(&I) &&
625  cast<IntrinsicInst>(&I)->getIntrinsicID() ==
626  Intrinsic::sideeffect) {
627  // Ignore llvm.sideeffect calls.
628  } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
629  LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
630  << '\n');
631  break;
632  } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
633  LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
634  << '\n');
635  break;
636  }
637  }
638 
639  OrderedBasicBlock OBB(Chain[0]->getParent());
640 
641  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
642  unsigned ChainInstrIdx = 0;
643  Instruction *BarrierMemoryInstr = nullptr;
644 
645  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
646  Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
647 
648  // If a barrier memory instruction was found, chain instructions that follow
649  // will not be added to the valid prefix.
650  if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
651  break;
652 
653  // Check (in BB order) if any instruction prevents ChainInstr from being
654  // vectorized. Find and store the first such "conflicting" instruction.
655  for (Instruction *MemInstr : MemoryInstrs) {
656  // If a barrier memory instruction was found, do not check past it.
657  if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
658  break;
659 
660  auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
661  auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
662  if (MemLoad && ChainLoad)
663  continue;
664 
665  // We can ignore the alias if we have a load/store pair and the load
666  // is known to be invariant. The load cannot be clobbered by the store.
667  auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
668  return LI->getMetadata(LLVMContext::MD_invariant_load);
669  };
670 
671  // We can ignore the alias as long as the load comes before the store,
672  // because that means we won't be moving the load past the store to
673  // vectorize it (the vectorized load is inserted at the location of the
674  // first load in the chain).
675  if (isa<StoreInst>(MemInstr) && ChainLoad &&
676  (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))
677  continue;
678 
679  // Same case, but in reverse.
680  if (MemLoad && isa<StoreInst>(ChainInstr) &&
681  (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))
682  continue;
683 
684  if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
685  MemoryLocation::get(ChainInstr))) {
686  LLVM_DEBUG({
687  dbgs() << "LSV: Found alias:\n"
688  " Aliasing instruction and pointer:\n"
689  << " " << *MemInstr << '\n'
690  << " " << *getLoadStorePointerOperand(MemInstr) << '\n'
691  << " Aliased instruction and pointer:\n"
692  << " " << *ChainInstr << '\n'
693  << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
694  });
695  // Save this aliasing memory instruction as a barrier, but allow other
696  // instructions that precede the barrier to be vectorized with this one.
697  BarrierMemoryInstr = MemInstr;
698  break;
699  }
700  }
701  // Continue the search only for store chains, since vectorizing stores that
702  // precede an aliasing load is valid. Conversely, vectorizing loads is valid
703  // up to an aliasing store, but should not pull loads from further down in
704  // the basic block.
705  if (IsLoadChain && BarrierMemoryInstr) {
706  // The BarrierMemoryInstr is a store that precedes ChainInstr.
707  assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
708  break;
709  }
710  }
711 
712  // Find the largest prefix of Chain whose elements are all in
713  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
714  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
715  // order.)
716  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
717  ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
718  unsigned ChainIdx = 0;
719  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
720  if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
721  break;
722  }
723  return Chain.slice(0, ChainIdx);
724 }
725 
726 static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {
727  const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
728  if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
729  // The selects themselves are distinct instructions even if they share the
730  // same condition and evaluate to consecutive pointers for the true and
731  // false values of the condition. Therefore, using the selects themselves
732  // for grouping instructions would put consecutive accesses into different
733  // lists, and they then would not even be checked for being consecutive,
734  // and would not be vectorized.
735  return Sel->getCondition();
736  }
737  return ObjPtr;
738 }
739 
740 std::pair<InstrListMap, InstrListMap>
741 Vectorizer::collectInstructions(BasicBlock *BB) {
742  InstrListMap LoadRefs;
743  InstrListMap StoreRefs;
744 
745  for (Instruction &I : *BB) {
746  if (!I.mayReadOrWriteMemory())
747  continue;
748 
749  if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
750  if (!LI->isSimple())
751  continue;
752 
753  // Skip if it's not legal.
754  if (!TTI.isLegalToVectorizeLoad(LI))
755  continue;
756 
757  Type *Ty = LI->getType();
758  if (!VectorType::isValidElementType(Ty->getScalarType()))
759  continue;
760 
761  // Skip weird non-byte sizes. They probably aren't worth the effort of
762  // handling correctly.
763  unsigned TySize = DL.getTypeSizeInBits(Ty);
764  if ((TySize % 8) != 0)
765  continue;
766 
767  // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
768  // functions currently use an integer type for the vectorized
769  // load/store, and do not support casting between the integer type and a
770  // vector of pointers (e.g. i64 to <2 x i16*>).
771  if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
772  continue;
773 
774  Value *Ptr = LI->getPointerOperand();
775  unsigned AS = Ptr->getType()->getPointerAddressSpace();
776  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
777 
778  unsigned VF = VecRegSize / TySize;
779  VectorType *VecTy = dyn_cast<VectorType>(Ty);
780 
781  // No point in looking at these if they're too big to vectorize.
782  if (TySize > VecRegSize / 2 ||
783  (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
784  continue;
785 
786  // Make sure all the users of a vector are constant-index extracts.
787  if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
788  const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
789  return EEI && isa<ConstantInt>(EEI->getOperand(1));
790  }))
791  continue;
792 
793  // Save the load locations.
794  const ChainID ID = getChainID(Ptr, DL);
795  LoadRefs[ID].push_back(LI);
796  } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
797  if (!SI->isSimple())
798  continue;
799 
800  // Skip if it's not legal.
801  if (!TTI.isLegalToVectorizeStore(SI))
802  continue;
803 
804  Type *Ty = SI->getValueOperand()->getType();
805  if (!VectorType::isValidElementType(Ty->getScalarType()))
806  continue;
807 
808  // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
809  // functions currently use an integer type for the vectorized
810  // load/store, and do not support casting between the integer type and a
811  // vector of pointers (e.g. i64 to <2 x i16*>).
812  if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
813  continue;
814 
815  // Skip weird non-byte sizes. They probably aren't worth the effort of
816  // handling correctly.
817  unsigned TySize = DL.getTypeSizeInBits(Ty);
818  if ((TySize % 8) != 0)
819  continue;
820 
821  Value *Ptr = SI->getPointerOperand();
822  unsigned AS = Ptr->getType()->getPointerAddressSpace();
823  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
824 
825  unsigned VF = VecRegSize / TySize;
826  VectorType *VecTy = dyn_cast<VectorType>(Ty);
827 
828  // No point in looking at these if they're too big to vectorize.
829  if (TySize > VecRegSize / 2 ||
830  (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
831  continue;
832 
833  if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
834  const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
835  return EEI && isa<ConstantInt>(EEI->getOperand(1));
836  }))
837  continue;
838 
839  // Save store location.
840  const ChainID ID = getChainID(Ptr, DL);
841  StoreRefs[ID].push_back(SI);
842  }
843  }
844 
845  return {LoadRefs, StoreRefs};
846 }
847 
848 bool Vectorizer::vectorizeChains(InstrListMap &Map) {
849  bool Changed = false;
850 
851  for (const std::pair<ChainID, InstrList> &Chain : Map) {
852  unsigned Size = Chain.second.size();
853  if (Size < 2)
854  continue;
855 
856  LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
857 
858  // Process the chain (loads or stores) in chunks of 64.
859  for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
860  unsigned Len = std::min<unsigned>(CE - CI, 64);
861  ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
862  Changed |= vectorizeInstructions(Chunk);
863  }
864  }
865 
866  return Changed;
867 }
868 
869 bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
870  LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
871  << " instructions.\n");
872  SmallVector<int, 16> Heads, Tails;
873  int ConsecutiveChain[64];
874 
875  // Do a quadratic search on all of the given loads/stores and find all of the
876  // pairs of loads/stores that follow each other.
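 // As an illustration (hypothetical indices, not from a test): for three
 // consecutive i32 loads at p, p+4 and p+8 appearing as Instrs[0..2], this
 // search produces ConsecutiveChain = {1, 2, -1}, Heads = {0, 1} and
 // Tails = {1, 2}; the walk further down then rebuilds the full chain
 // starting from head 0.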
877  for (int i = 0, e = Instrs.size(); i < e; ++i) {
878  ConsecutiveChain[i] = -1;
879  for (int j = e - 1; j >= 0; --j) {
880  if (i == j)
881  continue;
882 
883  if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
884  if (ConsecutiveChain[i] != -1) {
885  int CurDistance = std::abs(ConsecutiveChain[i] - i);
886  int NewDistance = std::abs(ConsecutiveChain[i] - j);
887  if (j < i || NewDistance > CurDistance)
888  continue; // Should not insert.
889  }
890 
891  Tails.push_back(j);
892  Heads.push_back(i);
893  ConsecutiveChain[i] = j;
894  }
895  }
896  }
897 
898  bool Changed = false;
899  SmallPtrSet<Instruction *, 16> InstructionsProcessed;
900 
901  for (int Head : Heads) {
902  if (InstructionsProcessed.count(Instrs[Head]))
903  continue;
904  bool LongerChainExists = false;
905  for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
906  if (Head == Tails[TIt] &&
907  !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
908  LongerChainExists = true;
909  break;
910  }
911  if (LongerChainExists)
912  continue;
913 
914  // We found an instr that starts a chain. Now follow the chain and try to
915  // vectorize it.
916  SmallVector<Instruction *, 16> Operands;
917  int I = Head;
918  while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
919  if (InstructionsProcessed.count(Instrs[I]))
920  break;
921 
922  Operands.push_back(Instrs[I]);
923  I = ConsecutiveChain[I];
924  }
925 
926  bool Vectorized = false;
927  if (isa<LoadInst>(*Operands.begin()))
928  Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
929  else
930  Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
931 
932  Changed |= Vectorized;
933  }
934 
935  return Changed;
936 }
937 
938 bool Vectorizer::vectorizeStoreChain(
939  ArrayRef<Instruction *> Chain,
940  SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
941  StoreInst *S0 = cast<StoreInst>(Chain[0]);
942 
943  // If the vector has an int element, default to int for the whole store.
944  Type *StoreTy = nullptr;
945  for (Instruction *I : Chain) {
946  StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
947  if (StoreTy->isIntOrIntVectorTy())
948  break;
949 
950  if (StoreTy->isPtrOrPtrVectorTy()) {
951  StoreTy = Type::getIntNTy(F.getParent()->getContext(),
952  DL.getTypeSizeInBits(StoreTy));
953  break;
954  }
955  }
956  assert(StoreTy && "Failed to find store type");
957 
958  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
959  unsigned AS = S0->getPointerAddressSpace();
960  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
961  unsigned VF = VecRegSize / Sz;
962  unsigned ChainSize = Chain.size();
963  unsigned Alignment = getAlignment(S0);
964 
965  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
966  InstructionsProcessed->insert(Chain.begin(), Chain.end());
967  return false;
968  }
969 
970  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
971  if (NewChain.empty()) {
972  // No vectorization possible.
973  InstructionsProcessed->insert(Chain.begin(), Chain.end());
974  return false;
975  }
976  if (NewChain.size() == 1) {
977  // Failed after the first instruction. Discard it and try the smaller chain.
978  InstructionsProcessed->insert(NewChain.front());
979  return false;
980  }
981 
982  // Update Chain to the valid vectorizable subchain.
983  Chain = NewChain;
984  ChainSize = Chain.size();
985 
986  // Check if it's legal to vectorize this chain. If not, split the chain and
987  // try again.
988  unsigned EltSzInBytes = Sz / 8;
989  unsigned SzInBytes = EltSzInBytes * ChainSize;
990 
991  VectorType *VecTy;
992  VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
993  if (VecStoreTy)
994  VecTy = VectorType::get(StoreTy->getScalarType(),
995  Chain.size() * VecStoreTy->getNumElements());
996  else
997  VecTy = VectorType::get(StoreTy, Chain.size());
998 
999  // If it's more than the max vector size or the target has a better
1000  // vector factor, break it into two pieces.
1001  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
1002  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
1003  LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
1004  " Creating two separate arrays.\n");
1005  return vectorizeStoreChain(Chain.slice(0, TargetVF),
1006  InstructionsProcessed) |
1007  vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
1008  }
1009 
1010  LLVM_DEBUG({
1011  dbgs() << "LSV: Stores to vectorize:\n";
1012  for (Instruction *I : Chain)
1013  dbgs() << " " << *I << "\n";
1014  });
1015 
1016  // We won't try again to vectorize the elements of the chain, regardless of
1017  // whether we succeed below.
1018  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1019 
1020  // If the store is going to be misaligned, don't vectorize it.
1021  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1022  if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1023  auto Chains = splitOddVectorElts(Chain, Sz);
1024  return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
1025  vectorizeStoreChain(Chains.second, InstructionsProcessed);
1026  }
1027 
1028  unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
1029  StackAdjustedAlignment,
1030  DL, S0, nullptr, &DT);
1031  if (NewAlign != 0)
1032  Alignment = NewAlign;
1033  }
1034 
1035  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
1036  auto Chains = splitOddVectorElts(Chain, Sz);
1037  return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
1038  vectorizeStoreChain(Chains.second, InstructionsProcessed);
1039  }
1040 
1041  BasicBlock::iterator First, Last;
1042  std::tie(First, Last) = getBoundaryInstrs(Chain);
1043  Builder.SetInsertPoint(&*Last);
1044 
1045  Value *Vec = UndefValue::get(VecTy);
1046 
1047  if (VecStoreTy) {
1048  unsigned VecWidth = VecStoreTy->getNumElements();
1049  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1050  StoreInst *Store = cast<StoreInst>(Chain[I]);
1051  for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
1052  unsigned NewIdx = J + I * VecWidth;
1053  Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
1054  Builder.getInt32(J));
1055  if (Extract->getType() != StoreTy->getScalarType())
1056  Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
1057 
1058  Value *Insert =
1059  Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
1060  Vec = Insert;
1061  }
1062  }
1063  } else {
1064  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1065  StoreInst *Store = cast<StoreInst>(Chain[I]);
1066  Value *Extract = Store->getValueOperand();
1067  if (Extract->getType() != StoreTy->getScalarType())
1068  Extract =
1069  Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
1070 
1071  Value *Insert =
1072  Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
1073  Vec = Insert;
1074  }
1075  }
1076 
1077  StoreInst *SI = Builder.CreateAlignedStore(
1078  Vec,
1079  Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
1080  Alignment);
1081  propagateMetadata(SI, Chain);
1082 
1083  eraseInstructions(Chain);
1084  ++NumVectorInstructions;
1085  NumScalarsVectorized += Chain.size();
1086  return true;
1087 }
1088 
1089 bool Vectorizer::vectorizeLoadChain(
1090  ArrayRef<Instruction *> Chain,
1091  SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
1092  LoadInst *L0 = cast<LoadInst>(Chain[0]);
1093 
1094  // If the vector has an int element, default to int for the whole load.
1095  Type *LoadTy;
1096  for (const auto &V : Chain) {
1097  LoadTy = cast<LoadInst>(V)->getType();
1098  if (LoadTy->isIntOrIntVectorTy())
1099  break;
1100 
1101  if (LoadTy->isPtrOrPtrVectorTy()) {
1102  LoadTy = Type::getIntNTy(F.getParent()->getContext(),
1103  DL.getTypeSizeInBits(LoadTy));
1104  break;
1105  }
1106  }
1107 
1108  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
1109  unsigned AS = L0->getPointerAddressSpace();
1110  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1111  unsigned VF = VecRegSize / Sz;
1112  unsigned ChainSize = Chain.size();
1113  unsigned Alignment = getAlignment(L0);
1114 
1115  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
1116  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1117  return false;
1118  }
1119 
1120  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
1121  if (NewChain.empty()) {
1122  // No vectorization possible.
1123  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1124  return false;
1125  }
1126  if (NewChain.size() == 1) {
1127  // Failed after the first instruction. Discard it and try the smaller chain.
1128  InstructionsProcessed->insert(NewChain.front());
1129  return false;
1130  }
1131 
1132  // Update Chain to the valid vectorizable subchain.
1133  Chain = NewChain;
1134  ChainSize = Chain.size();
1135 
1136  // Check if it's legal to vectorize this chain. If not, split the chain and
1137  // try again.
1138  unsigned EltSzInBytes = Sz / 8;
1139  unsigned SzInBytes = EltSzInBytes * ChainSize;
1140  VectorType *VecTy;
1141  VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
1142  if (VecLoadTy)
1143  VecTy = VectorType::get(LoadTy->getScalarType(),
1144  Chain.size() * VecLoadTy->getNumElements());
1145  else
1146  VecTy = VectorType::get(LoadTy, Chain.size());
1147 
1148  // If it's more than the max vector size or the target has a better
1149  // vector factor, break it into two pieces.
1150  unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
1151  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
1152  LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
1153  " Creating two separate arrays.\n");
1154  return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
1155  vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
1156  }
1157 
1158  // We won't try again to vectorize the elements of the chain, regardless of
1159  // whether we succeed below.
1160  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1161 
1162  // If the load is going to be misaligned, don't vectorize it.
1163  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1164  if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1165  auto Chains = splitOddVectorElts(Chain, Sz);
1166  return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1167  vectorizeLoadChain(Chains.second, InstructionsProcessed);
1168  }
1169 
1170  Alignment = getOrEnforceKnownAlignment(
1171  L0->getPointerOperand(), StackAdjustedAlignment, DL, L0, nullptr, &DT);
1172  }
1173 
1174  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
1175  auto Chains = splitOddVectorElts(Chain, Sz);
1176  return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1177  vectorizeLoadChain(Chains.second, InstructionsProcessed);
1178  }
1179 
1180  LLVM_DEBUG({
1181  dbgs() << "LSV: Loads to vectorize:\n";
1182  for (Instruction *I : Chain)
1183  I->dump();
1184  });
1185 
1186  // getVectorizablePrefix already computed getBoundaryInstrs. The value of
1187  // Last may have changed since then, but the value of First won't have. If it
1188  // matters, we could compute getBoundaryInstrs only once and reuse it here.
1189  BasicBlock::iterator First, Last;
1190  std::tie(First, Last) = getBoundaryInstrs(Chain);
1191  Builder.SetInsertPoint(&*First);
1192 
1193  Value *Bitcast =
1194  Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
1195  LoadInst *LI = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);
1196  propagateMetadata(LI, Chain);
1197 
1198  if (VecLoadTy) {
1199  SmallVector<Instruction *, 16> InstrsToErase;
1200 
1201  unsigned VecWidth = VecLoadTy->getNumElements();
1202  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1203  for (auto Use : Chain[I]->users()) {
1204  // All users of vector loads are ExtractElement instructions with
1205  // constant indices, otherwise we would have bailed before now.
1206  Instruction *UI = cast<Instruction>(Use);
1207  unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
1208  unsigned NewIdx = Idx + I * VecWidth;
1209  Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
1210  UI->getName());
1211  if (V->getType() != UI->getType())
1212  V = Builder.CreateBitCast(V, UI->getType());
1213 
1214  // Replace the old instruction.
1215  UI->replaceAllUsesWith(V);
1216  InstrsToErase.push_back(UI);
1217  }
1218  }
1219 
1220  // Bitcast might not be an Instruction, if the value being loaded is a
1221  // constant. In that case, no need to reorder anything.
1222  if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1223  reorder(BitcastInst);
1224 
1225  for (auto I : InstrsToErase)
1226  I->eraseFromParent();
1227  } else {
1228  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1229  Value *CV = Chain[I];
1230  Value *V =
1231  Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
1232  if (V->getType() != CV->getType()) {
1233  V = Builder.CreateBitOrPointerCast(V, CV->getType());
1234  }
1235 
1236  // Replace the old instruction.
1237  CV->replaceAllUsesWith(V);
1238  }
1239 
1240  if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1241  reorder(BitcastInst);
1242  }
1243 
1244  eraseInstructions(Chain);
1245 
1246  ++NumVectorInstructions;
1247  NumScalarsVectorized += Chain.size();
1248  return true;
1249 }
1250 
1251 bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
1252  unsigned Alignment) {
1253  if (Alignment % SzInBytes == 0)
1254  return false;
1255 
1256  bool Fast = false;
1257  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
1258  SzInBytes * 8, AddressSpace,
1259  Alignment, &Fast);
1260  LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
1261  << " and fast? " << Fast << "\n";);
1262  return !Allows || !Fast;
1263 }