1 //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass merges loads/stores to/from sequential memory addresses into vector
10 // loads/stores. Although there's nothing GPU-specific in here, this pass is
11 // motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12 //
13 // (For simplicity below we talk about loads only, but everything also applies
14 // to stores.)
15 //
16 // This pass is intended to be run late in the pipeline, after other
17 // vectorization opportunities have been exploited. So the assumption here is
18 // that immediately following our new vector load we'll need to extract out the
19 // individual elements of the load, so we can operate on them individually.
20 //
21 // On CPUs this transformation is usually not beneficial, because extracting the
22 // elements of a vector register is expensive on most architectures. It's
23 // usually better just to load each element individually into its own scalar
24 // register.
25 //
26 // However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27 // "vector load" loads directly into a series of scalar registers. In effect,
28 // extracting the elements of the vector is free. It's therefore always
29 // beneficial to vectorize a sequence of loads on these architectures.
30 //
31 // Vectorizing (perhaps a better name might be "coalescing") loads can have
32 // large performance impacts on GPU kernels, and opportunities for vectorizing
33 // are common in GPU code. This pass tries very hard to find such
34 // opportunities; its runtime is quadratic in the number of loads in a BB.
35 //
36 // Some CPU architectures, such as ARM, have instructions that load into
37 // multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38 // could use this pass (with some modifications), but currently it implements
39 // its own pass to do something similar to what we do here.
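//
// As a rough sketch (an assumed example, not taken from the original
// comment), given two adjacent four-byte loads such as:
//
//   %p1 = getelementptr inbounds float, float* %p, i64 1
//   %x0 = load float, float* %p, align 4
//   %x1 = load float, float* %p1, align 4
//
// the pass may, target permitting, rewrite them into roughly:
//
//   %vp = bitcast float* %p to <2 x float>*
//   %v  = load <2 x float>, <2 x float>* %vp, align 4
//   %x0 = extractelement <2 x float> %v, i32 0
//   %x1 = extractelement <2 x float> %v, i32 1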
40 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
41 #include "llvm/ADT/APInt.h"
42 #include "llvm/ADT/ArrayRef.h"
43 #include "llvm/ADT/MapVector.h"
44 #include "llvm/ADT/PostOrderIterator.h"
45 #include "llvm/ADT/STLExtras.h"
46 #include "llvm/ADT/SmallPtrSet.h"
47 #include "llvm/ADT/SmallVector.h"
48 #include "llvm/ADT/Statistic.h"
49 #include "llvm/ADT/iterator_range.h"
50 #include "llvm/Analysis/AliasAnalysis.h"
51 #include "llvm/Analysis/GlobalsModRef.h"
52 #include "llvm/Analysis/MemoryLocation.h"
53 #include "llvm/Analysis/OrderedBasicBlock.h"
54 #include "llvm/Analysis/ScalarEvolution.h"
55 #include "llvm/Analysis/TargetTransformInfo.h"
56 #include "llvm/Analysis/ValueTracking.h"
57 #include "llvm/Analysis/VectorUtils.h"
58 #include "llvm/IR/Attributes.h"
59 #include "llvm/IR/BasicBlock.h"
60 #include "llvm/IR/Constants.h"
61 #include "llvm/IR/DataLayout.h"
62 #include "llvm/IR/DerivedTypes.h"
63 #include "llvm/IR/Dominators.h"
64 #include "llvm/IR/Function.h"
65 #include "llvm/IR/IRBuilder.h"
66 #include "llvm/IR/InstrTypes.h"
67 #include "llvm/IR/Instruction.h"
68 #include "llvm/IR/Instructions.h"
69 #include "llvm/IR/IntrinsicInst.h"
70 #include "llvm/IR/Module.h"
71 #include "llvm/IR/Type.h"
72 #include "llvm/IR/User.h"
73 #include "llvm/IR/Value.h"
74 #include "llvm/Pass.h"
75 #include "llvm/Support/Casting.h"
76 #include "llvm/Support/Debug.h"
77 #include "llvm/Support/KnownBits.h"
78 #include "llvm/Support/MathExtras.h"
79 #include "llvm/Support/raw_ostream.h"
80 #include "llvm/Transforms/Utils/Local.h"
81 #include "llvm/Transforms/Vectorize.h"
82 #include <algorithm>
83 #include <cassert>
84 #include <cstdlib>
85 #include <tuple>
86 #include <utility>
87 
88 using namespace llvm;
89 
90 #define DEBUG_TYPE "load-store-vectorizer"
91 
92 STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
93 STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
94 
95 // FIXME: Assuming stack alignment of 4 is always good enough
96 static const unsigned StackAdjustedAlignment = 4;
97 
98 namespace {
99 
100 /// ChainID is an arbitrary token that is allowed to be different only for the
101 /// accesses that are guaranteed to be considered non-consecutive by
102 /// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
103 /// together and reducing the number of instructions the main search operates on
104 /// at a time, i.e. this is to reduce compile time and nothing else as the main
105 /// search has O(n^2) time complexity. The underlying type of ChainID should not
106 /// be relied upon.
107 using ChainID = const Value *;
108 using InstrList = SmallVector<Instruction *, 8>;
109 using InstrListMap = MapVector<ChainID, InstrList>;
110 
111 class Vectorizer {
112  Function &F;
113  AliasAnalysis &AA;
114  DominatorTree &DT;
115  ScalarEvolution &SE;
116  TargetTransformInfo &TTI;
117  const DataLayout &DL;
118  IRBuilder<> Builder;
119 
120 public:
121  Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
122  ScalarEvolution &SE, TargetTransformInfo &TTI)
123  : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
124  DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
125 
126  bool run();
127 
128 private:
129  unsigned getPointerAddressSpace(Value *I);
130 
131  unsigned getAlignment(LoadInst *LI) const {
132  unsigned Align = LI->getAlignment();
133  if (Align != 0)
134  return Align;
135 
136  return DL.getABITypeAlignment(LI->getType());
137  }
138 
139  unsigned getAlignment(StoreInst *SI) const {
140  unsigned Align = SI->getAlignment();
141  if (Align != 0)
142  return Align;
143 
144  return DL.getABITypeAlignment(SI->getValueOperand()->getType());
145  }
146 
147  static const unsigned MaxDepth = 3;
148 
149  bool isConsecutiveAccess(Value *A, Value *B);
150  bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
151  unsigned Depth = 0) const;
152  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
153  unsigned Depth) const;
154  bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
155  unsigned Depth) const;
156 
157  /// After vectorization, reorder the instructions that I depends on
158  /// (the instructions defining its operands), to ensure they dominate I.
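 /// A typical case (an assumed example, not from the original comment): the
 /// vector load that replaces a chain is inserted at the position of the
 /// first chain element in program order, but its pointer operand comes from
 /// the lowest-address element, e.g.
 ///   %gep1 = getelementptr i32, i32* %base, i64 1
 ///   %a    = load i32, i32* %gep1        ; first in program order
 ///   %gep0 = getelementptr i32, i32* %base, i64 0
 ///   %b    = load i32, i32* %gep0        ; lowest address in the chain
 /// Here %gep0 (and anything it depends on) must be moved above the new
 /// vector load, which is what reorder() does.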
159  void reorder(Instruction *I);
160 
161  /// Returns the first and the last instructions in Chain.
162  std::pair<BasicBlock::iterator, BasicBlock::iterator>
163  getBoundaryInstrs(ArrayRef<Instruction *> Chain);
164 
165  /// Erases the original instructions after vectorizing.
166  void eraseInstructions(ArrayRef<Instruction *> Chain);
167 
168  /// "Legalize" the vector type that would be produced by combining \p
169  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
170  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
171  /// expected to have more than 4 elements.
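 /// Worked example (illustrative): a chain of 7 i16 elements is 14 bytes;
 /// rounding 14 down to a multiple of 4 gives 12, so the chain is split into
 /// 6 elements (12 bytes) and the remaining 1 element (2 bytes).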
172  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
173  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
174 
175  /// Finds the largest prefix of Chain that's vectorizable, checking for
176  /// intervening instructions which may affect the memory accessed by the
177  /// instructions within Chain.
178  ///
179  /// The elements of \p Chain must be all loads or all stores and must be in
180  /// address order.
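 ///
 /// For instance (an assumed example): if a chain of loads L0, L1, L2 has a
 /// may-aliasing store sitting between L1 and L2 in program order, only the
 /// prefix that can be vectorized without moving a load across that store,
 /// here [L0, L1], is returned.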
181  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
182 
183  /// Collects load and store instructions to vectorize.
184  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
185 
186  /// Processes the collected instructions, the \p Map. The values of \p Map
187  /// should be all loads or all stores.
188  bool vectorizeChains(InstrListMap &Map);
189 
190  /// Finds the load/stores to consecutive memory addresses and vectorizes them.
191  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
192 
193  /// Vectorizes the load instructions in Chain.
194  bool
195  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
196  SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
197 
198  /// Vectorizes the store instructions in Chain.
199  bool
200  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
201  SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
202 
203  /// Check whether this load/store access is misaligned.
204  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
205  unsigned Alignment);
206 };
207 
208 class LoadStoreVectorizerLegacyPass : public FunctionPass {
209 public:
210  static char ID;
211 
212  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
213  initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
214  }
215 
216  bool runOnFunction(Function &F) override;
217 
218  StringRef getPassName() const override {
219  return "GPU Load and Store Vectorizer";
220  }
221 
222  void getAnalysisUsage(AnalysisUsage &AU) const override {
223  AU.addRequired<AAResultsWrapperPass>();
224  AU.addRequired<ScalarEvolutionWrapperPass>();
225  AU.addRequired<DominatorTreeWrapperPass>();
226  AU.addRequired<TargetTransformInfoWrapperPass>();
227  AU.setPreservesCFG();
228  }
229 };
230 
231 } // end anonymous namespace
232 
233 char LoadStoreVectorizerLegacyPass::ID = 0;
234 
235 INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
236  "Vectorize load and Store instructions", false, false)
237 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
238 INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
239 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
240 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
241 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
242 INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
243  "Vectorize load and store instructions", false, false)
244 
245 Pass *llvm::createLoadStoreVectorizerPass() {
246  return new LoadStoreVectorizerLegacyPass();
247 }
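// Usage sketch (an assumption about the caller, not part of this file): the
// legacy pass can be added to a pass manager, or exercised directly with
// `opt -load-store-vectorizer`:
//
//   legacy::PassManager PM;
//   PM.add(createLoadStoreVectorizerPass());
//   PM.run(M);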
248 
249 bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
250  // Don't vectorize when the attribute NoImplicitFloat is used.
251  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
252  return false;
253 
254  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
255  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
256  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
257  TargetTransformInfo &TTI =
258  getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
259 
260  Vectorizer V(F, AA, DT, SE, TTI);
261  return V.run();
262 }
263 
264 PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
265  // Don't vectorize when the attribute NoImplicitFloat is used.
266  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
267  return PreservedAnalyses::all();
268 
269  AliasAnalysis &AA = AM.getResult<AAManager>(F);
270  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
271  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
272  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
273 
274  Vectorizer V(F, AA, DT, SE, TTI);
275  bool Changed = V.run();
276  PreservedAnalyses PA;
277  PA.preserveSet<CFGAnalyses>();
278  return Changed ? PA : PreservedAnalyses::all();
279 }
280 
281 // The real propagateMetadata expects a SmallVector<Value*>, but we deal in
282 // vectors of Instructions.
283 static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
284  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
285  propagateMetadata(I, VL);
286 }
287 
288 // Vectorizer Implementation
289 bool Vectorizer::run() {
290  bool Changed = false;
291 
292  // Scan the blocks in the function in post order.
293  for (BasicBlock *BB : post_order(&F)) {
294  InstrListMap LoadRefs, StoreRefs;
295  std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
296  Changed |= vectorizeChains(LoadRefs);
297  Changed |= vectorizeChains(StoreRefs);
298  }
299 
300  return Changed;
301 }
302 
303 unsigned Vectorizer::getPointerAddressSpace(Value *I) {
304  if (LoadInst *L = dyn_cast<LoadInst>(I))
305  return L->getPointerAddressSpace();
306  if (StoreInst *S = dyn_cast<StoreInst>(I))
307  return S->getPointerAddressSpace();
308  return -1;
309 }
310 
311 // FIXME: Merge with llvm::isConsecutiveAccess
312 bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
313  Value *PtrA = getLoadStorePointerOperand(A);
314  Value *PtrB = getLoadStorePointerOperand(B);
315  unsigned ASA = getPointerAddressSpace(A);
316  unsigned ASB = getPointerAddressSpace(B);
317 
318  // Check that the address spaces match and that the pointers are valid.
319  if (!PtrA || !PtrB || (ASA != ASB))
320  return false;
321 
322  // Make sure that A and B are different pointers of the same size type.
323  Type *PtrATy = PtrA->getType()->getPointerElementType();
324  Type *PtrBTy = PtrB->getType()->getPointerElementType();
325  if (PtrA == PtrB ||
326  PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
327  DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
328  DL.getTypeStoreSize(PtrATy->getScalarType()) !=
329  DL.getTypeStoreSize(PtrBTy->getScalarType()))
330  return false;
331 
332  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
333  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
334 
335  return areConsecutivePointers(PtrA, PtrB, Size);
336 }
337 
338 bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
339  APInt PtrDelta, unsigned Depth) const {
340  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
341  APInt OffsetA(PtrBitWidth, 0);
342  APInt OffsetB(PtrBitWidth, 0);
343  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
344  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
345 
346  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
347 
348  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
349  return false;
350 
351  // In case we have to shrink the pointer,
352  // stripAndAccumulateInBoundsConstantOffsets should properly handle a
353  // possible overflow and the value should fit into the smallest data type
354  // used in the cast/gep chain.
355  assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
356  OffsetB.getMinSignedBits() <= NewPtrBitWidth);
357 
358  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
359  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
360  PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);
361 
362  APInt OffsetDelta = OffsetB - OffsetA;
363 
364  // Check if they are based on the same pointer. That makes the offsets
365  // sufficient.
366  if (PtrA == PtrB)
367  return OffsetDelta == PtrDelta;
368 
369  // Compute the necessary base pointer delta to have the necessary final delta
370  // equal to the pointer delta requested.
371  APInt BaseDelta = PtrDelta - OffsetDelta;
372 
373  // Compute the distance with SCEV between the base pointers.
374  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
375  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
376  const SCEV *C = SE.getConstant(BaseDelta);
377  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
378  if (X == PtrSCEVB)
379  return true;
380 
381  // The above check will not catch the cases where one of the pointers is
382  // factorized but the other one is not, such as (C + (S * (A + B))) vs
383  // (AS + BS). Get the minus SCEV. That will allow re-combining the expressions
384  // and getting the simplified difference.
385  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
386  if (C == Dist)
387  return true;
388 
389  // Sometimes even this doesn't work, because SCEV can't always see through
390  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
391  // things the hard way.
392  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
393 }
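// A small worked example for the SCEV reasoning above (assumed IR, not from
// the original source): with
//   %pa = getelementptr inbounds i32, i32* %base, i64 %i
//   %pb = getelementptr inbounds i32, i32* %base, i64 %j   ; %j == %i + 1
// and PtrDelta == 4, both accumulated constant offsets are 0, so BaseDelta
// is 4 and ScalarEvolution can prove (%base + 4 * %i) + 4 == %base + 4 * %j.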
394 
395 bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
396  APInt PtrDelta,
397  unsigned Depth) const {
398  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
399  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
400  if (!GEPA || !GEPB)
401  return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
402 
403  // Look through GEPs after checking they're the same except for the last
404  // index.
405  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
406  GEPA->getPointerOperand() != GEPB->getPointerOperand())
407  return false;
408  gep_type_iterator GTIA = gep_type_begin(GEPA);
409  gep_type_iterator GTIB = gep_type_begin(GEPB);
410  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
411  if (GTIA.getOperand() != GTIB.getOperand())
412  return false;
413  ++GTIA;
414  ++GTIB;
415  }
416 
417  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
418  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
419  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
420  OpA->getType() != OpB->getType())
421  return false;
422 
423  if (PtrDelta.isNegative()) {
424  if (PtrDelta.isMinSignedValue())
425  return false;
426  PtrDelta.negate();
427  std::swap(OpA, OpB);
428  }
429  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
430  if (PtrDelta.urem(Stride) != 0)
431  return false;
432  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
433  APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
434 
435  // Only look through a ZExt/SExt.
436  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
437  return false;
438 
439  bool Signed = isa<SExtInst>(OpA);
440 
441  // At this point A could be a function parameter, i.e. not an instruction
442  Value *ValA = OpA->getOperand(0);
443  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
444  if (!OpB || ValA->getType() != OpB->getType())
445  return false;
446 
447  // Now we need to prove that adding IdxDiff to ValA won't overflow.
448  bool Safe = false;
449  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
450  // ValA, we're okay.
451  if (OpB->getOpcode() == Instruction::Add &&
452  isa<ConstantInt>(OpB->getOperand(1)) &&
453  IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
454  if (Signed)
455  Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
456  else
457  Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
458  }
459 
460  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
461 
462  // Second attempt:
463  // If all set bits of IdxDiff or any higher order bit other than the sign bit
464  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
465  // overflow of any sort.
466  if (!Safe) {
467  OpA = dyn_cast<Instruction>(ValA);
468  if (!OpA)
469  return false;
470  KnownBits Known(BitWidth);
471  computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
472  APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
473  if (Signed)
474  BitsAllowedToBeSet.clearBit(BitWidth - 1);
475  if (BitsAllowedToBeSet.ult(IdxDiff))
476  return false;
477  }
478 
479  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
480  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
481  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
482  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
483  return X == OffsetSCEVB;
484 }
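// A sketch of the kind of addressing this fallback is designed to handle
// (assumed IR; whether the plain SCEV check above already catches a given
// case depends on what ScalarEvolution can prove):
//   %m0 = and i32 %w, 7               ; upper bits known to be zero
//   %m1 = add i32 %m0, 1              ; no nuw/nsw flags
//   %e0 = zext i32 %m0 to i64
//   %e1 = zext i32 %m1 to i64
//   %p0 = getelementptr float, float* %base, i64 %e0
//   %p1 = getelementptr float, float* %base, i64 %e1
// Here Stride == 4 and IdxDiff == 1; computeKnownBits on %m0 shows that
// adding 1 cannot overflow, and SCEV then confirms %m0 + 1 == %m1.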
485 
486 bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
487  const APInt &PtrDelta,
488  unsigned Depth) const {
489  if (Depth++ == MaxDepth)
490  return false;
491 
492  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
493  if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
494  return SelectA->getCondition() == SelectB->getCondition() &&
495  areConsecutivePointers(SelectA->getTrueValue(),
496  SelectB->getTrueValue(), PtrDelta, Depth) &&
497  areConsecutivePointers(SelectA->getFalseValue(),
498  SelectB->getFalseValue(), PtrDelta, Depth);
499  }
500  }
501  return false;
502 }
503 
504 void Vectorizer::reorder(Instruction *I) {
505  OrderedBasicBlock OBB(I->getParent());
506  SmallPtrSet<Instruction *, 16> InstructionsToMove;
507  SmallVector<Instruction *, 16> Worklist;
508 
509  Worklist.push_back(I);
510  while (!Worklist.empty()) {
511  Instruction *IW = Worklist.pop_back_val();
512  int NumOperands = IW->getNumOperands();
513  for (int i = 0; i < NumOperands; i++) {
514  Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
515  if (!IM || IM->getOpcode() == Instruction::PHI)
516  continue;
517 
518  // If IM is in another BB, no need to move it, because this pass only
519  // vectorizes instructions within one BB.
520  if (IM->getParent() != I->getParent())
521  continue;
522 
523  if (!OBB.dominates(IM, I)) {
524  InstructionsToMove.insert(IM);
525  Worklist.push_back(IM);
526  }
527  }
528  }
529 
530  // All instructions to move should follow I. Start from I, not from begin().
531  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
532  ++BBI) {
533  if (!InstructionsToMove.count(&*BBI))
534  continue;
535  Instruction *IM = &*BBI;
536  --BBI;
537  IM->removeFromParent();
538  IM->insertBefore(I);
539  }
540 }
541 
542 std::pair<BasicBlock::iterator, BasicBlock::iterator>
543 Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
544  Instruction *C0 = Chain[0];
545  BasicBlock::iterator FirstInstr = C0->getIterator();
546  BasicBlock::iterator LastInstr = C0->getIterator();
547 
548  BasicBlock *BB = C0->getParent();
549  unsigned NumFound = 0;
550  for (Instruction &I : *BB) {
551  if (!is_contained(Chain, &I))
552  continue;
553 
554  ++NumFound;
555  if (NumFound == 1) {
556  FirstInstr = I.getIterator();
557  }
558  if (NumFound == Chain.size()) {
559  LastInstr = I.getIterator();
560  break;
561  }
562  }
563 
564  // Range is [first, last).
565  return std::make_pair(FirstInstr, ++LastInstr);
566 }
567 
568 void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
569  SmallVector<Instruction *, 16> Instrs;
570  for (Instruction *I : Chain) {
571  Value *PtrOperand = getLoadStorePointerOperand(I);
572  assert(PtrOperand && "Instruction must have a pointer operand.");
573  Instrs.push_back(I);
574  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
575  Instrs.push_back(GEP);
576  }
577 
578  // Erase instructions.
579  for (Instruction *I : Instrs)
580  if (I->use_empty())
581  I->eraseFromParent();
582 }
583 
584 std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
585 Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
586  unsigned ElementSizeBits) {
587  unsigned ElementSizeBytes = ElementSizeBits / 8;
588  unsigned SizeBytes = ElementSizeBytes * Chain.size();
589  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
590  if (NumLeft == Chain.size()) {
591  if ((NumLeft & 1) == 0)
592  NumLeft /= 2; // Split even in half
593  else
594  --NumLeft; // Split off last element
595  } else if (NumLeft == 0)
596  NumLeft = 1;
597  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
598 }
599 
600 ArrayRef<Instruction *>
601 Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
602  // These are in BB order, unlike Chain, which is in address order.
603  SmallVector<Instruction *, 16> MemoryInstrs;
604  SmallVector<Instruction *, 16> ChainInstrs;
605 
606  bool IsLoadChain = isa<LoadInst>(Chain[0]);
607  LLVM_DEBUG({
608  for (Instruction *I : Chain) {
609  if (IsLoadChain)
610  assert(isa<LoadInst>(I) &&
611  "All elements of Chain must be loads, or all must be stores.");
612  else
613  assert(isa<StoreInst>(I) &&
614  "All elements of Chain must be loads, or all must be stores.");
615  }
616  });
617 
618  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
619  if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
620  if (!is_contained(Chain, &I))
621  MemoryInstrs.push_back(&I);
622  else
623  ChainInstrs.push_back(&I);
624  } else if (isa<IntrinsicInst>(&I) &&
625  cast<IntrinsicInst>(&I)->getIntrinsicID() ==
626  Intrinsic::sideeffect) {
627  // Ignore llvm.sideeffect calls.
628  } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
629  LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
630  << '\n');
631  break;
632  } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
633  LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
634  << '\n');
635  break;
636  }
637  }
638 
639  OrderedBasicBlock OBB(Chain[0]->getParent());
640 
641  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
642  unsigned ChainInstrIdx = 0;
643  Instruction *BarrierMemoryInstr = nullptr;
644 
645  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
646  Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
647 
648  // If a barrier memory instruction was found, chain instructions that follow
649  // will not be added to the valid prefix.
650  if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
651  break;
652 
653  // Check (in BB order) if any instruction prevents ChainInstr from being
654  // vectorized. Find and store the first such "conflicting" instruction.
655  for (Instruction *MemInstr : MemoryInstrs) {
656  // If a barrier memory instruction was found, do not check past it.
657  if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
658  break;
659 
660  auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
661  auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
662  if (MemLoad && ChainLoad)
663  continue;
664 
665  // We can ignore the alias if we have a load/store pair and the load
666  // is known to be invariant. The load cannot be clobbered by the store.
667  auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
668  return LI->hasMetadata(LLVMContext::MD_invariant_load);
669  };
670 
671  // We can ignore the alias as long as the load comes before the store,
672  // because that means we won't be moving the load past the store to
673  // vectorize it (the vectorized load is inserted at the location of the
674  // first load in the chain).
675  if (isa<StoreInst>(MemInstr) && ChainLoad &&
676  (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))
677  continue;
678 
679  // Same case, but in reverse.
680  if (MemLoad && isa<StoreInst>(ChainInstr) &&
681  (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))
682  continue;
683 
684  if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
685  MemoryLocation::get(ChainInstr))) {
686  LLVM_DEBUG({
687  dbgs() << "LSV: Found alias:\n"
688  " Aliasing instruction and pointer:\n"
689  << " " << *MemInstr << '\n'
690  << " " << *getLoadStorePointerOperand(MemInstr) << '\n'
691  << " Aliased instruction and pointer:\n"
692  << " " << *ChainInstr << '\n'
693  << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
694  });
695  // Save this aliasing memory instruction as a barrier, but allow other
696  // instructions that precede the barrier to be vectorized with this one.
697  BarrierMemoryInstr = MemInstr;
698  break;
699  }
700  }
701  // Continue the search only for store chains, since vectorizing stores that
702  // precede an aliasing load is valid. Conversely, vectorizing loads is valid
703  // up to an aliasing store, but should not pull loads from further down in
704  // the basic block.
705  if (IsLoadChain && BarrierMemoryInstr) {
706  // The BarrierMemoryInstr is a store that precedes ChainInstr.
707  assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
708  break;
709  }
710  }
711 
712  // Find the largest prefix of Chain whose elements are all in
713  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
714  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
715  // order.)
716  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
717  ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
718  unsigned ChainIdx = 0;
719  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
720  if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
721  break;
722  }
723  return Chain.slice(0, ChainIdx);
724 }
725 
726 static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {
727  const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
728  if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
729  // The selects themselves are distinct instructions even if they share the
730  // same condition and evaluate to consecutive pointers for the true and
731  // false values of the condition. Therefore, using the selects themselves
732  // for grouping instructions would put consecutive accesses into different
733  // lists; they would then never even be checked for being consecutive, and
734  // would not be vectorized.
735  return Sel->getCondition();
736  }
737  return ObjPtr;
738 }
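// Illustrative examples (not from the original source): loads through
// `gep %a, 0` and `gep %a, 1` both map to the underlying object %a and land
// in the same list, while pointers like
//   %q0 = select i1 %c, i32* %p0, i32* %r0
//   %q1 = select i1 %c, i32* %p1, i32* %r1
// are grouped by the shared condition %c, so accesses through them can still
// be checked for being consecutive.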
739 
740 std::pair<InstrListMap, InstrListMap>
741 Vectorizer::collectInstructions(BasicBlock *BB) {
742  InstrListMap LoadRefs;
743  InstrListMap StoreRefs;
744 
745  for (Instruction &I : *BB) {
746  if (!I.mayReadOrWriteMemory())
747  continue;
748 
749  if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
750  if (!LI->isSimple())
751  continue;
752 
753  // Skip if it's not legal.
754  if (!TTI.isLegalToVectorizeLoad(LI))
755  continue;
756 
757  Type *Ty = LI->getType();
758  if (!VectorType::isValidElementType(Ty->getScalarType()))
759  continue;
760 
761  // Skip weird non-byte sizes. They probably aren't worth the effort of
762  // handling correctly.
763  unsigned TySize = DL.getTypeSizeInBits(Ty);
764  if ((TySize % 8) != 0)
765  continue;
766 
767  // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
768  // functions are currently using an integer type for the vectorized
769  // load/store, and does not support casting between the integer type and a
770  // vector of pointers (e.g. i64 to <2 x i16*>)
771  if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
772  continue;
773 
774  Value *Ptr = LI->getPointerOperand();
775  unsigned AS = Ptr->getType()->getPointerAddressSpace();
776  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
777 
778  unsigned VF = VecRegSize / TySize;
779  VectorType *VecTy = dyn_cast<VectorType>(Ty);
780 
781  // No point in looking at these if they're too big to vectorize.
782  if (TySize > VecRegSize / 2 ||
783  (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
784  continue;
785 
786  // Make sure all the users of a vector are constant-index extracts.
787  if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
788  const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
789  return EEI && isa<ConstantInt>(EEI->getOperand(1));
790  }))
791  continue;
792 
793  // Save the load locations.
794  const ChainID ID = getChainID(Ptr, DL);
795  LoadRefs[ID].push_back(LI);
796  } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
797  if (!SI->isSimple())
798  continue;
799 
800  // Skip if it's not legal.
801  if (!TTI.isLegalToVectorizeStore(SI))
802  continue;
803 
804  Type *Ty = SI->getValueOperand()->getType();
805  if (!VectorType::isValidElementType(Ty->getScalarType()))
806  continue;
807 
808  // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
809  // functions are currently using an integer type for the vectorized
810  // load/store, and does not support casting between the integer type and a
811  // vector of pointers (e.g. i64 to <2 x i16*>)
812  if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
813  continue;
814 
815  // Skip weird non-byte sizes. They probably aren't worth the effort of
816  // handling correctly.
817  unsigned TySize = DL.getTypeSizeInBits(Ty);
818  if ((TySize % 8) != 0)
819  continue;
820 
821  Value *Ptr = SI->getPointerOperand();
822  unsigned AS = Ptr->getType()->getPointerAddressSpace();
823  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
824 
825  unsigned VF = VecRegSize / TySize;
826  VectorType *VecTy = dyn_cast<VectorType>(Ty);
827 
828  // No point in looking at these if they're too big to vectorize.
829  if (TySize > VecRegSize / 2 ||
830  (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
831  continue;
832 
833  if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
834  const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
835  return EEI && isa<ConstantInt>(EEI->getOperand(1));
836  }))
837  continue;
838 
839  // Save store location.
840  const ChainID ID = getChainID(Ptr, DL);
841  StoreRefs[ID].push_back(SI);
842  }
843  }
844 
845  return {LoadRefs, StoreRefs};
846 }
847 
848 bool Vectorizer::vectorizeChains(InstrListMap &Map) {
849  bool Changed = false;
850 
851  for (const std::pair<ChainID, InstrList> &Chain : Map) {
852  unsigned Size = Chain.second.size();
853  if (Size < 2)
854  continue;
855 
856  LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
857 
858  // Process the stores in chunks of 64.
859  for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
860  unsigned Len = std::min<unsigned>(CE - CI, 64);
861  ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
862  Changed |= vectorizeInstructions(Chunk);
863  }
864  }
865 
866  return Changed;
867 }
868 
869 bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
870  LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
871  << " instructions.\n");
872  SmallVector<int, 16> Heads, Tails;
873  int ConsecutiveChain[64];
874 
875  // Do a quadratic search on all of the given loads/stores and find all of the
876  // pairs of loads/stores that follow each other.
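  // For instance (an assumed example): for three accesses at addresses p,
  // p+4 and p+8 appearing as Instrs[0..2], this search records
  // ConsecutiveChain = {0 -> 1, 1 -> 2}, Heads = [0, 1] and Tails = [1, 2];
  // the loop after it then starts from head 0 and vectorizes the whole chain
  // [0, 1, 2], marking the other head as processed.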
877  for (int i = 0, e = Instrs.size(); i < e; ++i) {
878  ConsecutiveChain[i] = -1;
879  for (int j = e - 1; j >= 0; --j) {
880  if (i == j)
881  continue;
882 
883  if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
884  if (ConsecutiveChain[i] != -1) {
885  int CurDistance = std::abs(ConsecutiveChain[i] - i);
886  int NewDistance = std::abs(ConsecutiveChain[i] - j);
887  if (j < i || NewDistance > CurDistance)
888  continue; // Should not insert.
889  }
890 
891  Tails.push_back(j);
892  Heads.push_back(i);
893  ConsecutiveChain[i] = j;
894  }
895  }
896  }
897 
898  bool Changed = false;
899  SmallPtrSet<Instruction *, 16> InstructionsProcessed;
900 
901  for (int Head : Heads) {
902  if (InstructionsProcessed.count(Instrs[Head]))
903  continue;
904  bool LongerChainExists = false;
905  for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
906  if (Head == Tails[TIt] &&
907  !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
908  LongerChainExists = true;
909  break;
910  }
911  if (LongerChainExists)
912  continue;
913 
914  // We found an instr that starts a chain. Now follow the chain and try to
915  // vectorize it.
916  SmallVector<Instruction *, 16> Operands;
917  int I = Head;
918  while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
919  if (InstructionsProcessed.count(Instrs[I]))
920  break;
921 
922  Operands.push_back(Instrs[I]);
923  I = ConsecutiveChain[I];
924  }
925 
926  bool Vectorized = false;
927  if (isa<LoadInst>(*Operands.begin()))
928  Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
929  else
930  Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
931 
932  Changed |= Vectorized;
933  }
934 
935  return Changed;
936 }
937 
938 bool Vectorizer::vectorizeStoreChain(
939  ArrayRef<Instruction *> Chain,
940  SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
941  StoreInst *S0 = cast<StoreInst>(Chain[0]);
942 
943  // If the vector has an int element, default to int for the whole store.
944  Type *StoreTy = nullptr;
945  for (Instruction *I : Chain) {
946  StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
947  if (StoreTy->isIntOrIntVectorTy())
948  break;
949 
950  if (StoreTy->isPtrOrPtrVectorTy()) {
951  StoreTy = Type::getIntNTy(F.getParent()->getContext(),
952  DL.getTypeSizeInBits(StoreTy));
953  break;
954  }
955  }
956  assert(StoreTy && "Failed to find store type");
957 
958  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
959  unsigned AS = S0->getPointerAddressSpace();
960  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
961  unsigned VF = VecRegSize / Sz;
962  unsigned ChainSize = Chain.size();
963  unsigned Alignment = getAlignment(S0);
964 
965  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
966  InstructionsProcessed->insert(Chain.begin(), Chain.end());
967  return false;
968  }
969 
970  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
971  if (NewChain.empty()) {
972  // No vectorization possible.
973  InstructionsProcessed->insert(Chain.begin(), Chain.end());
974  return false;
975  }
976  if (NewChain.size() == 1) {
977  // Failed after the first instruction. Discard it and try the smaller chain.
978  InstructionsProcessed->insert(NewChain.front());
979  return false;
980  }
981 
982  // Update Chain to the valid vectorizable subchain.
983  Chain = NewChain;
984  ChainSize = Chain.size();
985 
986  // Check if it's legal to vectorize this chain. If not, split the chain and
987  // try again.
988  unsigned EltSzInBytes = Sz / 8;
989  unsigned SzInBytes = EltSzInBytes * ChainSize;
990 
991  VectorType *VecTy;
992  VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
993  if (VecStoreTy)
994  VecTy = VectorType::get(StoreTy->getScalarType(),
995  Chain.size() * VecStoreTy->getNumElements());
996  else
997  VecTy = VectorType::get(StoreTy, Chain.size());
998 
999  // If it's more than the max vector size or the target has a better
1000  // vector factor, break it into two pieces.
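  // Concretely (an assumed configuration, not from the original comment):
  // with 128-bit vector registers and i32 elements, VF == 4; if the target
  // reports the same factor, a chain of six such stores is split below into
  // slices of four and two, and each slice is retried on its own.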
1001  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
1002  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
1003  LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
1004  " Creating two separate arrays.\n");
1005  return vectorizeStoreChain(Chain.slice(0, TargetVF),
1006  InstructionsProcessed) |
1007  vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
1008  }
1009 
1010  LLVM_DEBUG({
1011  dbgs() << "LSV: Stores to vectorize:\n";
1012  for (Instruction *I : Chain)
1013  dbgs() << " " << *I << "\n";
1014  });
1015 
1016  // We won't try again to vectorize the elements of the chain, regardless of
1017  // whether we succeed below.
1018  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1019 
1020  // If the store is going to be misaligned, don't vectorize it.
1021  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1022  if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1023  auto Chains = splitOddVectorElts(Chain, Sz);
1024  return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
1025  vectorizeStoreChain(Chains.second, InstructionsProcessed);
1026  }
1027 
1028  unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
1029  StackAdjustedAlignment,
1030  DL, S0, nullptr, &DT);
1031  if (NewAlign != 0)
1032  Alignment = NewAlign;
1033  }
1034 
1035  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
1036  auto Chains = splitOddVectorElts(Chain, Sz);
1037  return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
1038  vectorizeStoreChain(Chains.second, InstructionsProcessed);
1039  }
1040 
1041  BasicBlock::iterator First, Last;
1042  std::tie(First, Last) = getBoundaryInstrs(Chain);
1043  Builder.SetInsertPoint(&*Last);
1044 
1045  Value *Vec = UndefValue::get(VecTy);
1046 
1047  if (VecStoreTy) {
1048  unsigned VecWidth = VecStoreTy->getNumElements();
1049  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1050  StoreInst *Store = cast<StoreInst>(Chain[I]);
1051  for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
1052  unsigned NewIdx = J + I * VecWidth;
1053  Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
1054  Builder.getInt32(J));
1055  if (Extract->getType() != StoreTy->getScalarType())
1056  Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
1057 
1058  Value *Insert =
1059  Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
1060  Vec = Insert;
1061  }
1062  }
1063  } else {
1064  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1065  StoreInst *Store = cast<StoreInst>(Chain[I]);
1066  Value *Extract = Store->getValueOperand();
1067  if (Extract->getType() != StoreTy->getScalarType())
1068  Extract =
1069  Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
1070 
1071  Value *Insert =
1072  Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
1073  Vec = Insert;
1074  }
1075  }
1076 
1077  StoreInst *SI = Builder.CreateAlignedStore(
1078  Vec,
1079  Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
1080  Alignment);
1081  propagateMetadata(SI, Chain);
1082 
1083  eraseInstructions(Chain);
1084  ++NumVectorInstructions;
1085  NumScalarsVectorized += Chain.size();
1086  return true;
1087 }
1088 
1089 bool Vectorizer::vectorizeLoadChain(
1090  ArrayRef<Instruction *> Chain,
1091  SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
1092  LoadInst *L0 = cast<LoadInst>(Chain[0]);
1093 
1094  // If the vector has an int element, default to int for the whole load.
1095  Type *LoadTy = nullptr;
1096  for (const auto &V : Chain) {
1097  LoadTy = cast<LoadInst>(V)->getType();
1098  if (LoadTy->isIntOrIntVectorTy())
1099  break;
1100 
1101  if (LoadTy->isPtrOrPtrVectorTy()) {
1102  LoadTy = Type::getIntNTy(F.getParent()->getContext(),
1103  DL.getTypeSizeInBits(LoadTy));
1104  break;
1105  }
1106  }
1107  assert(LoadTy && "Can't determine LoadInst type from chain");
1108 
1109  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
1110  unsigned AS = L0->getPointerAddressSpace();
1111  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1112  unsigned VF = VecRegSize / Sz;
1113  unsigned ChainSize = Chain.size();
1114  unsigned Alignment = getAlignment(L0);
1115 
1116  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
1117  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1118  return false;
1119  }
1120 
1121  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
1122  if (NewChain.empty()) {
1123  // No vectorization possible.
1124  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1125  return false;
1126  }
1127  if (NewChain.size() == 1) {
1128  // Failed after the first instruction. Discard it and try the smaller chain.
1129  InstructionsProcessed->insert(NewChain.front());
1130  return false;
1131  }
1132 
1133  // Update Chain to the valid vectorizable subchain.
1134  Chain = NewChain;
1135  ChainSize = Chain.size();
1136 
1137  // Check if it's legal to vectorize this chain. If not, split the chain and
1138  // try again.
1139  unsigned EltSzInBytes = Sz / 8;
1140  unsigned SzInBytes = EltSzInBytes * ChainSize;
1141  VectorType *VecTy;
1142  VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
1143  if (VecLoadTy)
1144  VecTy = VectorType::get(LoadTy->getScalarType(),
1145  Chain.size() * VecLoadTy->getNumElements());
1146  else
1147  VecTy = VectorType::get(LoadTy, Chain.size());
1148 
1149  // If it's more than the max vector size or the target has a better
1150  // vector factor, break it into two pieces.
1151  unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
1152  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
1153  LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
1154  " Creating two separate arrays.\n");
1155  return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
1156  vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
1157  }
1158 
1159  // We won't try again to vectorize the elements of the chain, regardless of
1160  // whether we succeed below.
1161  InstructionsProcessed->insert(Chain.begin(), Chain.end());
1162 
1163  // If the load is going to be misaligned, don't vectorize it.
1164  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1165  if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1166  auto Chains = splitOddVectorElts(Chain, Sz);
1167  return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1168  vectorizeLoadChain(Chains.second, InstructionsProcessed);
1169  }
1170 
1171  Alignment = getOrEnforceKnownAlignment(
1172  L0->getPointerOperand(), StackAdjustedAlignment, DL, L0, nullptr, &DT);
1173  }
1174 
1175  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
1176  auto Chains = splitOddVectorElts(Chain, Sz);
1177  return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1178  vectorizeLoadChain(Chains.second, InstructionsProcessed);
1179  }
1180 
1181  LLVM_DEBUG({
1182  dbgs() << "LSV: Loads to vectorize:\n";
1183  for (Instruction *I : Chain)
1184  I->dump();
1185  });
1186 
1187  // getVectorizablePrefix already computed getBoundaryInstrs. The value of
1188  // Last may have changed since then, but the value of First won't have. If it
1189  // matters, we could compute getBoundaryInstrs only once and reuse it here.
1190  BasicBlock::iterator First, Last;
1191  std::tie(First, Last) = getBoundaryInstrs(Chain);
1192  Builder.SetInsertPoint(&*First);
1193 
1194  Value *Bitcast =
1195  Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
1196  LoadInst *LI = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);
1197  propagateMetadata(LI, Chain);
1198 
1199  if (VecLoadTy) {
1200  SmallVector<Instruction *, 16> InstrsToErase;
1201 
1202  unsigned VecWidth = VecLoadTy->getNumElements();
1203  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1204  for (auto Use : Chain[I]->users()) {
1205  // All users of vector loads are ExtractElement instructions with
1206  // constant indices, otherwise we would have bailed before now.
1207  Instruction *UI = cast<Instruction>(Use);
1208  unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
1209  unsigned NewIdx = Idx + I * VecWidth;
1210  Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
1211  UI->getName());
1212  if (V->getType() != UI->getType())
1213  V = Builder.CreateBitCast(V, UI->getType());
1214 
1215  // Replace the old instruction.
1216  UI->replaceAllUsesWith(V);
1217  InstrsToErase.push_back(UI);
1218  }
1219  }
1220 
1221  // Bitcast might not be an Instruction if the pointer being loaded from is
1222  // a constant. In that case, there is no need to reorder anything.
1223  if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1224  reorder(BitcastInst);
1225 
1226  for (auto I : InstrsToErase)
1227  I->eraseFromParent();
1228  } else {
1229  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1230  Value *CV = Chain[I];
1231  Value *V =
1232  Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
1233  if (V->getType() != CV->getType()) {
1234  V = Builder.CreateBitOrPointerCast(V, CV->getType());
1235  }
1236 
1237  // Replace the old instruction.
1238  CV->replaceAllUsesWith(V);
1239  }
1240 
1241  if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1242  reorder(BitcastInst);
1243  }
1244 
1245  eraseInstructions(Chain);
1246 
1247  ++NumVectorInstructions;
1248  NumScalarsVectorized += Chain.size();
1249  return true;
1250 }
1251 
1252 bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
1253  unsigned Alignment) {
1254  if (Alignment % SzInBytes == 0)
1255  return false;
1256 
1257  bool Fast = false;
1258  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
1259  SzInBytes * 8, AddressSpace,
1260  Alignment, &Fast);
1261  LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
1262  << " and fast? " << Fast << "\n";);
1263  return !Allows || !Fast;
1264 }