LLVM 20.0.0git
LoadStoreVectorizer.cpp
Go to the documentation of this file.
1//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass merges loads/stores to/from sequential memory addresses into vector
10// loads/stores. Although there's nothing GPU-specific in here, this pass is
11// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12//
13// (For simplicity below we talk about loads only, but everything also applies
14// to stores.)
15//
16// This pass is intended to be run late in the pipeline, after other
17// vectorization opportunities have been exploited. So the assumption here is
18// that immediately following our new vector load we'll need to extract out the
19// individual elements of the load, so we can operate on them individually.
20//
21// On CPUs this transformation is usually not beneficial, because extracting the
22// elements of a vector register is expensive on most architectures. It's
23// usually better just to load each element individually into its own scalar
24// register.
25//
26// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27// "vector load" loads directly into a series of scalar registers. In effect,
28// extracting the elements of the vector is free. It's therefore always
29// beneficial to vectorize a sequence of loads on these architectures.
30//
31// Vectorizing (perhaps a better name might be "coalescing") loads can have
32// large performance impacts on GPU kernels, and opportunities for vectorizing
33// are common in GPU code. This pass tries very hard to find such
34// opportunities; its runtime is quadratic in the number of loads in a BB.
35//
36// Some CPU architectures, such as ARM, have instructions that load into
37// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38// could use this pass (with some modifications), but currently it implements
39// its own pass to do something similar to what we do here.
40//
41// Overview of the algorithm and terminology in this pass:
42//
43// - Break up each basic block into pseudo-BBs, composed of instructions which
44// are guaranteed to transfer control to their successors.
45// - Within a single pseudo-BB, find all loads, and group them into
46// "equivalence classes" according to getUnderlyingObject() and loaded
47// element size. Do the same for stores.
48// - For each equivalence class, greedily build "chains". Each chain has a
49// leader instruction, and every other member of the chain has a known
50// constant offset from the first instr in the chain.
51// - Break up chains so that they contain only contiguous accesses of legal
52// size with no intervening may-alias instrs.
53// - Convert each chain to vector instructions.
54//
55// The O(n^2) behavior of this pass comes from initially building the chains.
56// In the worst case we have to compare each new instruction to all of those
57// that came before. To limit this, we only calculate the offset to the leaders
58// of the N most recently-used chains.
59
61#include "llvm/ADT/APInt.h"
62#include "llvm/ADT/ArrayRef.h"
63#include "llvm/ADT/DenseMap.h"
64#include "llvm/ADT/MapVector.h"
66#include "llvm/ADT/STLExtras.h"
67#include "llvm/ADT/Sequence.h"
70#include "llvm/ADT/Statistic.h"
79#include "llvm/IR/Attributes.h"
80#include "llvm/IR/BasicBlock.h"
82#include "llvm/IR/Constants.h"
83#include "llvm/IR/DataLayout.h"
85#include "llvm/IR/Dominators.h"
86#include "llvm/IR/Function.h"
88#include "llvm/IR/IRBuilder.h"
89#include "llvm/IR/InstrTypes.h"
90#include "llvm/IR/Instruction.h"
92#include "llvm/IR/LLVMContext.h"
93#include "llvm/IR/Module.h"
94#include "llvm/IR/Type.h"
95#include "llvm/IR/Value.h"
97#include "llvm/Pass.h"
100#include "llvm/Support/Debug.h"
103#include "llvm/Support/ModRef.h"
106#include <algorithm>
107#include <cassert>
108#include <cstdint>
109#include <cstdlib>
110#include <iterator>
111#include <numeric>
112#include <optional>
113#include <tuple>
114#include <type_traits>
115#include <utility>
116#include <vector>
117
118using namespace llvm;
119
120#define DEBUG_TYPE "load-store-vectorizer"
121
122STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
123STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
124
125namespace {
126
127// Equivalence class key, the initial tuple by which we group loads/stores.
128// Loads/stores with different EqClassKeys are never merged.
129//
130// (We could in theory remove element-size from the this tuple. We'd just need
131// to fix up the vector packing/unpacking code.)
132using EqClassKey =
133 std::tuple<const Value * /* result of getUnderlyingObject() */,
134 unsigned /* AddrSpace */,
135 unsigned /* Load/Store element size bits */,
136 char /* IsLoad; char b/c bool can't be a DenseMap key */
137 >;
139 const EqClassKey &K) {
140 const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
141 return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
142 << " of element size " << ElementSize << " bits in addrspace "
143 << AddrSpace;
144}
145
146// A Chain is a set of instructions such that:
147// - All instructions have the same equivalence class, so in particular all are
148// loads, or all are stores.
149// - We know the address accessed by the i'th chain elem relative to the
150// chain's leader instruction, which is the first instr of the chain in BB
151// order.
152//
153// Chains have two canonical orderings:
154// - BB order, sorted by Instr->comesBefore.
155// - Offset order, sorted by OffsetFromLeader.
156// This pass switches back and forth between these orders.
157struct ChainElem {
158 Instruction *Inst;
159 APInt OffsetFromLeader;
160 ChainElem(Instruction *Inst, APInt OffsetFromLeader)
161 : Inst(std::move(Inst)), OffsetFromLeader(std::move(OffsetFromLeader)) {}
162};
163using Chain = SmallVector<ChainElem, 1>;
164
165void sortChainInBBOrder(Chain &C) {
166 sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
167}
168
169void sortChainInOffsetOrder(Chain &C) {
170 sort(C, [](const auto &A, const auto &B) {
171 if (A.OffsetFromLeader != B.OffsetFromLeader)
172 return A.OffsetFromLeader.slt(B.OffsetFromLeader);
173 return A.Inst->comesBefore(B.Inst); // stable tiebreaker
174 });
175}
176
177[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
178 for (const auto &E : C) {
179 dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
180 }
181}
182
183using EquivalenceClassMap =
185
186// FIXME: Assuming stack alignment of 4 is always good enough
187constexpr unsigned StackAdjustedAlignment = 4;
188
191 for (const ChainElem &E : C)
192 Values.emplace_back(E.Inst);
193 return propagateMetadata(I, Values);
194}
195
196bool isInvariantLoad(const Instruction *I) {
197 const LoadInst *LI = dyn_cast<LoadInst>(I);
198 return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
199}
200
201/// Reorders the instructions that I depends on (the instructions defining its
202/// operands), to ensure they dominate I.
203void reorder(Instruction *I) {
204 SmallPtrSet<Instruction *, 16> InstructionsToMove;
206
207 Worklist.emplace_back(I);
208 while (!Worklist.empty()) {
209 Instruction *IW = Worklist.pop_back_val();
210 int NumOperands = IW->getNumOperands();
211 for (int Idx = 0; Idx < NumOperands; Idx++) {
212 Instruction *IM = dyn_cast<Instruction>(IW->getOperand(Idx));
213 if (!IM || IM->getOpcode() == Instruction::PHI)
214 continue;
215
216 // If IM is in another BB, no need to move it, because this pass only
217 // vectorizes instructions within one BB.
218 if (IM->getParent() != I->getParent())
219 continue;
220
221 assert(IM != I && "Unexpected cycle while re-ordering instructions");
222
223 if (!IM->comesBefore(I)) {
224 InstructionsToMove.insert(IM);
225 Worklist.emplace_back(IM);
226 }
227 }
228 }
229
230 // All instructions to move should follow I. Start from I, not from begin().
231 for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
232 Instruction *IM = &*(BBI++);
233 if (!InstructionsToMove.contains(IM))
234 continue;
235 IM->moveBefore(I);
236 }
237}
238
239class Vectorizer {
240 Function &F;
241 AliasAnalysis &AA;
242 AssumptionCache &AC;
243 DominatorTree &DT;
244 ScalarEvolution &SE;
246 const DataLayout &DL;
247 IRBuilder<> Builder;
248
249 // We could erase instrs right after vectorizing them, but that can mess up
250 // our BB iterators, and also can make the equivalence class keys point to
251 // freed memory. This is fixable, but it's simpler just to wait until we're
252 // done with the BB and erase all at once.
254
255public:
256 Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
258 : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
259 DL(F.getDataLayout()), Builder(SE.getContext()) {}
260
261 bool run();
262
263private:
264 static const unsigned MaxDepth = 3;
265
266 /// Runs the vectorizer on a "pseudo basic block", which is a range of
267 /// instructions [Begin, End) within one BB all of which have
268 /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
269 bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
270
271 /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
272 /// in the same BB with the same value for getUnderlyingObject() etc.
273 bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
275
276 /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
277 /// where all instructions access a known, constant offset from the first
278 /// instruction.
279 bool runOnChain(Chain &C);
280
281 /// Splits the chain into subchains of instructions which read/write a
282 /// contiguous block of memory. Discards any length-1 subchains (because
283 /// there's nothing to vectorize in there).
284 std::vector<Chain> splitChainByContiguity(Chain &C);
285
286 /// Splits the chain into subchains where it's safe to hoist loads up to the
287 /// beginning of the sub-chain and it's safe to sink loads up to the end of
288 /// the sub-chain. Discards any length-1 subchains.
289 std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
290
291 /// Splits the chain into subchains that make legal, aligned accesses.
292 /// Discards any length-1 subchains.
293 std::vector<Chain> splitChainByAlignment(Chain &C);
294
295 /// Converts the instrs in the chain into a single vectorized load or store.
296 /// Adds the old scalar loads/stores to ToErase.
297 bool vectorizeChain(Chain &C);
298
299 /// Tries to compute the offset in bytes PtrB - PtrA.
300 std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
301 Instruction *ContextInst,
302 unsigned Depth = 0);
303 std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
304 Instruction *ContextInst,
305 unsigned Depth);
306 std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
307 Instruction *ContextInst,
308 unsigned Depth);
309
310 /// Gets the element type of the vector that the chain will load or store.
311 /// This is nontrivial because the chain may contain elements of different
312 /// types; e.g. it's legal to have a chain that contains both i32 and float.
313 Type *getChainElemTy(const Chain &C);
314
315 /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
316 /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
317 /// instructions.
318 ///
319 /// The map ChainElemOffsets must contain all of the elements in
320 /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
321 /// address. It's ok if it contains additional entries.
322 template <bool IsLoadChain>
323 bool isSafeToMove(
324 Instruction *ChainElem, Instruction *ChainBegin,
326
327 /// Collects loads and stores grouped by "equivalence class", where:
328 /// - all elements in an eq class are a load or all are a store,
329 /// - they all load/store the same element size (it's OK to have e.g. i8 and
330 /// <4 x i8> in the same class, but not i32 and <4 x i8>), and
331 /// - they all have the same value for getUnderlyingObject().
332 EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
334
335 /// Partitions Instrs into "chains" where every instruction has a known
336 /// constant offset from the first instr in the chain.
337 ///
338 /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
339 /// in the chain is the leader, and an instr touches distance 0 from itself.
340 std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
341};
342
343class LoadStoreVectorizerLegacyPass : public FunctionPass {
344public:
345 static char ID;
346
347 LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
350 }
351
352 bool runOnFunction(Function &F) override;
353
354 StringRef getPassName() const override {
355 return "GPU Load and Store Vectorizer";
356 }
357
358 void getAnalysisUsage(AnalysisUsage &AU) const override {
364 AU.setPreservesCFG();
365 }
366};
367
368} // end anonymous namespace
369
370char LoadStoreVectorizerLegacyPass::ID = 0;
371
372INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
373 "Vectorize load and Store instructions", false, false)
380INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
381 "Vectorize load and store instructions", false, false)
382
384 return new LoadStoreVectorizerLegacyPass();
385}
386
387bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
388 // Don't vectorize when the attribute NoImplicitFloat is used.
389 if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
390 return false;
391
392 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
393 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
394 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
396 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
397
398 AssumptionCache &AC =
399 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
400
401 return Vectorizer(F, AA, AC, DT, SE, TTI).run();
402}
403
406 // Don't vectorize when the attribute NoImplicitFloat is used.
407 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
408 return PreservedAnalyses::all();
409
415
416 bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
419 return Changed ? PA : PreservedAnalyses::all();
420}
421
422bool Vectorizer::run() {
423 bool Changed = false;
424 // Break up the BB if there are any instrs which aren't guaranteed to transfer
425 // execution to their successor.
426 //
427 // Consider, for example:
428 //
429 // def assert_arr_len(int n) { if (n < 2) exit(); }
430 //
431 // load arr[0]
432 // call assert_array_len(arr.length)
433 // load arr[1]
434 //
435 // Even though assert_arr_len does not read or write any memory, we can't
436 // speculate the second load before the call. More info at
437 // https://github.com/llvm/llvm-project/issues/52950.
438 for (BasicBlock *BB : post_order(&F)) {
439 // BB must at least have a terminator.
440 assert(!BB->empty());
441
443 Barriers.emplace_back(BB->begin());
444 for (Instruction &I : *BB)
446 Barriers.emplace_back(I.getIterator());
447 Barriers.emplace_back(BB->end());
448
449 for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
450 ++It)
451 Changed |= runOnPseudoBB(*It, *std::next(It));
452
453 for (Instruction *I : ToErase) {
454 auto *PtrOperand = getLoadStorePointerOperand(I);
455 if (I->use_empty())
456 I->eraseFromParent();
458 }
459 ToErase.clear();
460 }
461
462 return Changed;
463}
464
465bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
467 LLVM_DEBUG({
468 dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
469 if (End != Begin->getParent()->end())
470 dbgs() << *End;
471 else
472 dbgs() << "<BB end>";
473 dbgs() << ")\n";
474 });
475
476 bool Changed = false;
477 for (const auto &[EqClassKey, EqClass] :
478 collectEquivalenceClasses(Begin, End))
479 Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
480
481 return Changed;
482}
483
484bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
485 ArrayRef<Instruction *> EqClass) {
486 bool Changed = false;
487
488 LLVM_DEBUG({
489 dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
490 << " keyed on " << EqClassKey << ":\n";
491 for (Instruction *I : EqClass)
492 dbgs() << " " << *I << "\n";
493 });
494
495 std::vector<Chain> Chains = gatherChains(EqClass);
496 LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
497 << " nontrivial chains.\n";);
498 for (Chain &C : Chains)
499 Changed |= runOnChain(C);
500 return Changed;
501}
502
503bool Vectorizer::runOnChain(Chain &C) {
504 LLVM_DEBUG({
505 dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
506 dumpChain(C);
507 });
508
509 // Split up the chain into increasingly smaller chains, until we can finally
510 // vectorize the chains.
511 //
512 // (Don't be scared by the depth of the loop nest here. These operations are
513 // all at worst O(n lg n) in the number of instructions, and splitting chains
514 // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
515 bool Changed = false;
516 for (auto &C : splitChainByMayAliasInstrs(C))
517 for (auto &C : splitChainByContiguity(C))
518 for (auto &C : splitChainByAlignment(C))
519 Changed |= vectorizeChain(C);
520 return Changed;
521}
522
523std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
524 if (C.empty())
525 return {};
526
527 sortChainInBBOrder(C);
528
529 LLVM_DEBUG({
530 dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
531 dumpChain(C);
532 });
533
534 // We know that elements in the chain with nonverlapping offsets can't
535 // alias, but AA may not be smart enough to figure this out. Use a
536 // hashtable.
537 DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
538 for (const auto &E : C)
539 ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
540
541 // Loads get hoisted up to the first load in the chain. Stores get sunk
542 // down to the last store in the chain. Our algorithm for loads is:
543 //
544 // - Take the first element of the chain. This is the start of a new chain.
545 // - Take the next element of `Chain` and check for may-alias instructions
546 // up to the start of NewChain. If no may-alias instrs, add it to
547 // NewChain. Otherwise, start a new NewChain.
548 //
549 // For stores it's the same except in the reverse direction.
550 //
551 // We expect IsLoad to be an std::bool_constant.
552 auto Impl = [&](auto IsLoad) {
553 // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
554 auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
555 if constexpr (IsLoad())
556 return std::make_pair(C.begin(), C.end());
557 else
558 return std::make_pair(C.rbegin(), C.rend());
559 }(IsLoad);
560 assert(ChainBegin != ChainEnd);
561
562 std::vector<Chain> Chains;
564 NewChain.emplace_back(*ChainBegin);
565 for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
566 if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
567 ChainOffsets)) {
568 LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
569 << *ChainIt->Inst << " into " << *ChainBegin->Inst
570 << "\n");
571 NewChain.emplace_back(*ChainIt);
572 } else {
574 dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
575 << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
576 if (NewChain.size() > 1) {
577 LLVM_DEBUG({
578 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
579 dumpChain(NewChain);
580 });
581 Chains.emplace_back(std::move(NewChain));
582 }
583
584 // Start a new chain.
585 NewChain = SmallVector<ChainElem, 1>({*ChainIt});
586 }
587 }
588 if (NewChain.size() > 1) {
589 LLVM_DEBUG({
590 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
591 dumpChain(NewChain);
592 });
593 Chains.emplace_back(std::move(NewChain));
594 }
595 return Chains;
596 };
597
598 if (isa<LoadInst>(C[0].Inst))
599 return Impl(/*IsLoad=*/std::bool_constant<true>());
600
601 assert(isa<StoreInst>(C[0].Inst));
602 return Impl(/*IsLoad=*/std::bool_constant<false>());
603}
604
605std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
606 if (C.empty())
607 return {};
608
609 sortChainInOffsetOrder(C);
610
611 LLVM_DEBUG({
612 dbgs() << "LSV: splitChainByContiguity considering chain:\n";
613 dumpChain(C);
614 });
615
616 std::vector<Chain> Ret;
617 Ret.push_back({C.front()});
618
619 for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
620 // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
621 auto &CurChain = Ret.back();
622 const ChainElem &Prev = CurChain.back();
623 unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
624 assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
625 "collectEquivalenceClass");
626 APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
627
628 // Add this instruction to the end of the current chain, or start a new one.
629 bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
630 LLVM_DEBUG(dbgs() << "LSV: Instructions are "
631 << (AreContiguous ? "" : "not ") << "contiguous: "
632 << *Prev.Inst << " (ends at offset " << PrevReadEnd
633 << ") -> " << *It->Inst << " (starts at offset "
634 << It->OffsetFromLeader << ")\n");
635 if (AreContiguous)
636 CurChain.push_back(*It);
637 else
638 Ret.push_back({*It});
639 }
640
641 // Filter out length-1 chains, these are uninteresting.
642 llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
643 return Ret;
644}
645
646Type *Vectorizer::getChainElemTy(const Chain &C) {
647 assert(!C.empty());
648 // The rules are:
649 // - If there are any pointer types in the chain, use an integer type.
650 // - Prefer an integer type if it appears in the chain.
651 // - Otherwise, use the first type in the chain.
652 //
653 // The rule about pointer types is a simplification when we merge e.g. a load
654 // of a ptr and a double. There's no direct conversion from a ptr to a
655 // double; it requires a ptrtoint followed by a bitcast.
656 //
657 // It's unclear to me if the other rules have any practical effect, but we do
658 // it to match this pass's previous behavior.
659 if (any_of(C, [](const ChainElem &E) {
660 return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
661 })) {
662 return Type::getIntNTy(
663 F.getContext(),
664 DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
665 }
666
667 for (const ChainElem &E : C)
668 if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
669 return T;
670 return getLoadStoreType(C[0].Inst)->getScalarType();
671}
672
673std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
674 // We use a simple greedy algorithm.
675 // - Given a chain of length N, find all prefixes that
676 // (a) are not longer than the max register length, and
677 // (b) are a power of 2.
678 // - Starting from the longest prefix, try to create a vector of that length.
679 // - If one of them works, great. Repeat the algorithm on any remaining
680 // elements in the chain.
681 // - If none of them work, discard the first element and repeat on a chain
682 // of length N-1.
683 if (C.empty())
684 return {};
685
686 sortChainInOffsetOrder(C);
687
688 LLVM_DEBUG({
689 dbgs() << "LSV: splitChainByAlignment considering chain:\n";
690 dumpChain(C);
691 });
692
693 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
694 auto GetVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
695 unsigned ChainSizeBytes, VectorType *VecTy) {
696 return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
697 ChainSizeBytes, VecTy)
698 : TTI.getStoreVectorFactor(VF, LoadStoreSize,
699 ChainSizeBytes, VecTy);
700 };
701
702#ifndef NDEBUG
703 for (const auto &E : C) {
704 Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
705 assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
706 "Should have filtered out non-power-of-two elements in "
707 "collectEquivalenceClasses.");
708 }
709#endif
710
711 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
712 unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
713
714 std::vector<Chain> Ret;
715 for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
716 // Find candidate chains of size not greater than the largest vector reg.
717 // These chains are over the closed interval [CBegin, CEnd].
718 SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
719 CandidateChains;
720 for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
721 APInt Sz = C[CEnd].OffsetFromLeader +
722 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
723 C[CBegin].OffsetFromLeader;
724 if (Sz.sgt(VecRegBytes))
725 break;
726 CandidateChains.emplace_back(CEnd,
727 static_cast<unsigned>(Sz.getLimitedValue()));
728 }
729
730 // Consider the longest chain first.
731 for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
732 It != End; ++It) {
733 auto [CEnd, SizeBytes] = *It;
735 dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
736 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
737
738 Type *VecElemTy = getChainElemTy(C);
739 // Note, VecElemTy is a power of 2, but might be less than one byte. For
740 // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
741 // VecElemTy would be i4.
742 unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);
743
744 // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
745 assert((8 * SizeBytes) % VecElemBits == 0);
746 unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
747 FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
748 unsigned VF = 8 * VecRegBytes / VecElemBits;
749
750 // Check that TTI is happy with this vectorization factor.
751 unsigned TargetVF = GetVectorFactor(VF, VecElemBits,
752 VecElemBits * NumVecElems / 8, VecTy);
753 if (TargetVF != VF && TargetVF < NumVecElems) {
755 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
756 "because TargetVF="
757 << TargetVF << " != VF=" << VF
758 << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
759 continue;
760 }
761
762 // Is a load/store with this alignment allowed by TTI and at least as fast
763 // as an unvectorized load/store?
764 //
765 // TTI and F are passed as explicit captures to WAR an MSVC misparse (??).
766 auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
767 &F = F](Align Alignment) {
768 if (Alignment.value() % SizeBytes == 0)
769 return true;
770 unsigned VectorizedSpeed = 0;
771 bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
772 F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
773 if (!AllowsMisaligned) {
775 << "LSV: Access of " << SizeBytes << "B in addrspace "
776 << AS << " with alignment " << Alignment.value()
777 << " is misaligned, and therefore can't be vectorized.\n");
778 return false;
779 }
780
781 unsigned ElementwiseSpeed = 0;
782 (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
783 Alignment, &ElementwiseSpeed);
784 if (VectorizedSpeed < ElementwiseSpeed) {
786 << "LSV: Access of " << SizeBytes << "B in addrspace "
787 << AS << " with alignment " << Alignment.value()
788 << " has relative speed " << VectorizedSpeed
789 << ", which is lower than the elementwise speed of "
790 << ElementwiseSpeed
791 << ". Therefore this access won't be vectorized.\n");
792 return false;
793 }
794 return true;
795 };
796
797 // If we're loading/storing from an alloca, align it if possible.
798 //
799 // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
800 // tells us this is beneficial. This feels a bit odd, but it matches
801 // existing tests. This isn't *so* bad, because at most we align to 4
802 // bytes (current value of StackAdjustedAlignment).
803 //
804 // FIXME: We will upgrade the alignment of the alloca even if it turns out
805 // we can't vectorize for some other reason.
806 Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
807 bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
808 isa<AllocaInst>(PtrOperand->stripPointerCasts());
809 Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
810 Align PrefAlign = Align(StackAdjustedAlignment);
811 if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
812 IsAllowedAndFast(PrefAlign)) {
814 PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
815 if (NewAlign >= Alignment) {
817 << "LSV: splitByChain upgrading alloca alignment from "
818 << Alignment.value() << " to " << NewAlign.value()
819 << "\n");
820 Alignment = NewAlign;
821 }
822 }
823
824 if (!IsAllowedAndFast(Alignment)) {
826 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
827 "because its alignment is not AllowedAndFast: "
828 << Alignment.value() << "\n");
829 continue;
830 }
831
832 if ((IsLoadChain &&
833 !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
834 (!IsLoadChain &&
835 !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
837 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
838 "because !isLegalToVectorizeLoad/StoreChain.");
839 continue;
840 }
841
842 // Hooray, we can vectorize this chain!
843 Chain &NewChain = Ret.emplace_back();
844 for (unsigned I = CBegin; I <= CEnd; ++I)
845 NewChain.emplace_back(C[I]);
846 CBegin = CEnd; // Skip over the instructions we've added to the chain.
847 break;
848 }
849 }
850 return Ret;
851}
852
853bool Vectorizer::vectorizeChain(Chain &C) {
854 if (C.size() < 2)
855 return false;
856
857 sortChainInOffsetOrder(C);
858
859 LLVM_DEBUG({
860 dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
861 dumpChain(C);
862 });
863
864 Type *VecElemTy = getChainElemTy(C);
865 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
866 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
867 unsigned ChainBytes = std::accumulate(
868 C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
869 return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
870 });
871 assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
872 // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
873 // than 1 byte (e.g. VecTy == <32 x i1>).
874 Type *VecTy = FixedVectorType::get(
875 VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
876
877 Align Alignment = getLoadStoreAlignment(C[0].Inst);
878 // If this is a load/store of an alloca, we might have upgraded the alloca's
879 // alignment earlier. Get the new alignment.
880 if (AS == DL.getAllocaAddrSpace()) {
881 Alignment = std::max(
882 Alignment,
884 MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
885 }
886
887 // All elements of the chain must have the same scalar-type size.
888#ifndef NDEBUG
889 for (const ChainElem &E : C)
890 assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
891 DL.getTypeStoreSize(VecElemTy));
892#endif
893
894 Instruction *VecInst;
895 if (IsLoadChain) {
896 // Loads get hoisted to the location of the first load in the chain. We may
897 // also need to hoist the (transitive) operands of the loads.
898 Builder.SetInsertPoint(
899 llvm::min_element(C, [](const auto &A, const auto &B) {
900 return A.Inst->comesBefore(B.Inst);
901 })->Inst);
902
903 // Chain is in offset order, so C[0] is the instr with the lowest offset,
904 // i.e. the root of the vector.
905 VecInst = Builder.CreateAlignedLoad(VecTy,
907 Alignment);
908
909 unsigned VecIdx = 0;
910 for (const ChainElem &E : C) {
911 Instruction *I = E.Inst;
912 Value *V;
914 if (auto *VT = dyn_cast<FixedVectorType>(T)) {
915 auto Mask = llvm::to_vector<8>(
916 llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
917 V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
918 VecIdx += VT->getNumElements();
919 } else {
920 V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
921 I->getName());
922 ++VecIdx;
923 }
924 if (V->getType() != I->getType())
925 V = Builder.CreateBitOrPointerCast(V, I->getType());
926 I->replaceAllUsesWith(V);
927 }
928
929 // Finally, we need to reorder the instrs in the BB so that the (transitive)
930 // operands of VecInst appear before it. To see why, suppose we have
931 // vectorized the following code:
932 //
933 // ptr1 = gep a, 1
934 // load1 = load i32 ptr1
935 // ptr0 = gep a, 0
936 // load0 = load i32 ptr0
937 //
938 // We will put the vectorized load at the location of the earliest load in
939 // the BB, i.e. load1. We get:
940 //
941 // ptr1 = gep a, 1
942 // loadv = load <2 x i32> ptr0
943 // load0 = extractelement loadv, 0
944 // load1 = extractelement loadv, 1
945 // ptr0 = gep a, 0
946 //
947 // Notice that loadv uses ptr0, which is defined *after* it!
948 reorder(VecInst);
949 } else {
950 // Stores get sunk to the location of the last store in the chain.
951 Builder.SetInsertPoint(llvm::max_element(C, [](auto &A, auto &B) {
952 return A.Inst->comesBefore(B.Inst);
953 })->Inst);
954
955 // Build the vector to store.
956 Value *Vec = PoisonValue::get(VecTy);
957 unsigned VecIdx = 0;
958 auto InsertElem = [&](Value *V) {
959 if (V->getType() != VecElemTy)
960 V = Builder.CreateBitOrPointerCast(V, VecElemTy);
961 Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
962 };
963 for (const ChainElem &E : C) {
964 auto *I = cast<StoreInst>(E.Inst);
965 if (FixedVectorType *VT =
966 dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
967 for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
968 InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
969 Builder.getInt32(J)));
970 }
971 } else {
972 InsertElem(I->getValueOperand());
973 }
974 }
975
976 // Chain is in offset order, so C[0] is the instr with the lowest offset,
977 // i.e. the root of the vector.
978 VecInst = Builder.CreateAlignedStore(
979 Vec,
981 Alignment);
982 }
983
984 propagateMetadata(VecInst, C);
985
986 for (const ChainElem &E : C)
987 ToErase.emplace_back(E.Inst);
988
989 ++NumVectorInstructions;
990 NumScalarsVectorized += C.size();
991 return true;
992}
993
994template <bool IsLoadChain>
995bool Vectorizer::isSafeToMove(
996 Instruction *ChainElem, Instruction *ChainBegin,
998 LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
999 << *ChainBegin << ")\n");
1000
1001 assert(isa<LoadInst>(ChainElem) == IsLoadChain);
1002 if (ChainElem == ChainBegin)
1003 return true;
1004
1005 // Invariant loads can always be reordered; by definition they are not
1006 // clobbered by stores.
1007 if (isInvariantLoad(ChainElem))
1008 return true;
1009
1010 auto BBIt = std::next([&] {
1011 if constexpr (IsLoadChain)
1012 return BasicBlock::reverse_iterator(ChainElem);
1013 else
1014 return BasicBlock::iterator(ChainElem);
1015 }());
1016 auto BBItEnd = std::next([&] {
1017 if constexpr (IsLoadChain)
1018 return BasicBlock::reverse_iterator(ChainBegin);
1019 else
1020 return BasicBlock::iterator(ChainBegin);
1021 }());
1022
1023 const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
1024 const unsigned ChainElemSize =
1025 DL.getTypeStoreSize(getLoadStoreType(ChainElem));
1026
1027 for (; BBIt != BBItEnd; ++BBIt) {
1028 Instruction *I = &*BBIt;
1029
1030 if (!I->mayReadOrWriteMemory())
1031 continue;
1032
1033 // Loads can be reordered with other loads.
1034 if (IsLoadChain && isa<LoadInst>(I))
1035 continue;
1036
1037 // Stores can be sunk below invariant loads.
1038 if (!IsLoadChain && isInvariantLoad(I))
1039 continue;
1040
1041 // If I is in the chain, we can tell whether it aliases ChainIt by checking
1042 // what offset ChainIt accesses. This may be better than AA is able to do.
1043 //
1044 // We should really only have duplicate offsets for stores (the duplicate
1045 // loads should be CSE'ed), but in case we have a duplicate load, we'll
1046 // split the chain so we don't have to handle this case specially.
1047 if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
1048 // I and ChainElem overlap if:
1049 // - I and ChainElem have the same offset, OR
1050 // - I's offset is less than ChainElem's, but I touches past the
1051 // beginning of ChainElem, OR
1052 // - ChainElem's offset is less than I's, but ChainElem touches past the
1053 // beginning of I.
1054 const APInt &IOffset = OffsetIt->second;
1055 unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
1056 if (IOffset == ChainElemOffset ||
1057 (IOffset.sle(ChainElemOffset) &&
1058 (IOffset + IElemSize).sgt(ChainElemOffset)) ||
1059 (ChainElemOffset.sle(IOffset) &&
1060 (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
1061 LLVM_DEBUG({
1062 // Double check that AA also sees this alias. If not, we probably
1063 // have a bug.
1064 ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1065 assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
1066 dbgs() << "LSV: Found alias in chain: " << *I << "\n";
1067 });
1068 return false; // We found an aliasing instruction; bail.
1069 }
1070
1071 continue; // We're confident there's no alias.
1072 }
1073
1074 LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
1075 ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1076 if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
1077 LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
1078 << " Aliasing instruction:\n"
1079 << " " << *I << '\n'
1080 << " Aliased instruction and pointer:\n"
1081 << " " << *ChainElem << '\n'
1082 << " " << *getLoadStorePointerOperand(ChainElem)
1083 << '\n');
1084
1085 return false;
1086 }
1087 }
1088 return true;
1089}
1090
1092 BinaryOperator *BinOpI = cast<BinaryOperator>(I);
1093 return (Signed && BinOpI->hasNoSignedWrap()) ||
1094 (!Signed && BinOpI->hasNoUnsignedWrap());
1095}
1096
1097static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
1098 unsigned MatchingOpIdxA, Instruction *AddOpB,
1099 unsigned MatchingOpIdxB, bool Signed) {
1100 LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
1101 << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
1102 << MatchingOpIdxA << ", AddOpB=" << *AddOpB
1103 << ", MatchingOpIdxB=" << MatchingOpIdxB
1104 << ", Signed=" << Signed << "\n");
1105 // If both OpA and OpB are adds with NSW/NUW and with one of the operands
1106 // being the same, we can guarantee that the transformation is safe if we can
1107 // prove that OpA won't overflow when Ret added to the other operand of OpA.
1108 // For example:
1109 // %tmp7 = add nsw i32 %tmp2, %v0
1110 // %tmp8 = sext i32 %tmp7 to i64
1111 // ...
1112 // %tmp11 = add nsw i32 %v0, 1
1113 // %tmp12 = add nsw i32 %tmp2, %tmp11
1114 // %tmp13 = sext i32 %tmp12 to i64
1115 //
1116 // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
1117 // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
1118 // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
1119 assert(AddOpA->getOpcode() == Instruction::Add &&
1120 AddOpB->getOpcode() == Instruction::Add &&
1121 checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
1122 if (AddOpA->getOperand(MatchingOpIdxA) ==
1123 AddOpB->getOperand(MatchingOpIdxB)) {
1124 Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
1125 Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
1126 Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
1127 Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
1128 // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
1129 if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
1130 checkNoWrapFlags(OtherInstrB, Signed) &&
1131 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1132 int64_t CstVal =
1133 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1134 if (OtherInstrB->getOperand(0) == OtherOperandA &&
1135 IdxDiff.getSExtValue() == CstVal)
1136 return true;
1137 }
1138 // Match `x +nsw/nuw (y +nsw/nuw -Idx)` and `x +nsw/nuw (y +nsw/nuw x)`.
1139 if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
1140 checkNoWrapFlags(OtherInstrA, Signed) &&
1141 isa<ConstantInt>(OtherInstrA->getOperand(1))) {
1142 int64_t CstVal =
1143 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1144 if (OtherInstrA->getOperand(0) == OtherOperandB &&
1145 IdxDiff.getSExtValue() == -CstVal)
1146 return true;
1147 }
1148 // Match `x +nsw/nuw (y +nsw/nuw c)` and
1149 // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
1150 if (OtherInstrA && OtherInstrB &&
1151 OtherInstrA->getOpcode() == Instruction::Add &&
1152 OtherInstrB->getOpcode() == Instruction::Add &&
1153 checkNoWrapFlags(OtherInstrA, Signed) &&
1154 checkNoWrapFlags(OtherInstrB, Signed) &&
1155 isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
1156 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1157 int64_t CstValA =
1158 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1159 int64_t CstValB =
1160 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1161 if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
1162 IdxDiff.getSExtValue() == (CstValB - CstValA))
1163 return true;
1164 }
1165 }
1166 return false;
1167}
1168
1169std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
1170 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1171 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
1172 << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
1173 << " Depth=" << Depth << "\n");
1174 auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
1175 auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
1176 if (!GEPA || !GEPB)
1177 return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);
1178
1179 // Look through GEPs after checking they're the same except for the last
1180 // index.
1181 if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
1182 GEPA->getPointerOperand() != GEPB->getPointerOperand())
1183 return std::nullopt;
1184 gep_type_iterator GTIA = gep_type_begin(GEPA);
1185 gep_type_iterator GTIB = gep_type_begin(GEPB);
1186 for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
1187 if (GTIA.getOperand() != GTIB.getOperand())
1188 return std::nullopt;
1189 ++GTIA;
1190 ++GTIB;
1191 }
1192
1193 Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
1194 Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
1195 if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
1196 OpA->getType() != OpB->getType())
1197 return std::nullopt;
1198
1199 uint64_t Stride = GTIA.getSequentialElementStride(DL);
1200
1201 // Only look through a ZExt/SExt.
1202 if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
1203 return std::nullopt;
1204
1205 bool Signed = isa<SExtInst>(OpA);
1206
1207 // At this point A could be a function parameter, i.e. not an instruction
1208 Value *ValA = OpA->getOperand(0);
1209 OpB = dyn_cast<Instruction>(OpB->getOperand(0));
1210 if (!OpB || ValA->getType() != OpB->getType())
1211 return std::nullopt;
1212
1213 const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
1214 const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
1215 const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
1216 if (IdxDiffSCEV == SE.getCouldNotCompute())
1217 return std::nullopt;
1218
1219 ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
1220 if (!IdxDiffRange.isSingleElement())
1221 return std::nullopt;
1222 APInt IdxDiff = *IdxDiffRange.getSingleElement();
1223
1224 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
1225 << "\n");
1226
1227 // Now we need to prove that adding IdxDiff to ValA won't overflow.
1228 bool Safe = false;
1229
1230 // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
1231 // ValA, we're okay.
1232 if (OpB->getOpcode() == Instruction::Add &&
1233 isa<ConstantInt>(OpB->getOperand(1)) &&
1234 IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
1236 Safe = true;
1237
1238 // Second attempt: check if we have eligible add NSW/NUW instruction
1239 // sequences.
1240 OpA = dyn_cast<Instruction>(ValA);
1241 if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
1242 OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
1243 checkNoWrapFlags(OpB, Signed)) {
1244 // In the checks below a matching operand in OpA and OpB is an operand which
1245 // is the same in those two instructions. Below we account for possible
1246 // orders of the operands of these add instructions.
1247 for (unsigned MatchingOpIdxA : {0, 1})
1248 for (unsigned MatchingOpIdxB : {0, 1})
1249 if (!Safe)
1250 Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
1251 MatchingOpIdxB, Signed);
1252 }
1253
1254 unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
1255
1256 // Third attempt:
1257 //
1258 // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
1259 // order bit other than the sign bit are known to be zero in ValA, we can add
1260 // Diff to it while guaranteeing no overflow of any sort.
1261 //
1262 // If IdxDiff is negative, do the same, but swap ValA and ValB.
1263 if (!Safe) {
1264 // When computing known bits, use the GEPs as context instructions, since
1265 // they likely are in the same BB as the load/store.
1266 KnownBits Known(BitWidth);
1267 computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC,
1268 ContextInst, &DT);
1269 APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
1270 if (Signed)
1271 BitsAllowedToBeSet.clearBit(BitWidth - 1);
1272 if (BitsAllowedToBeSet.ult(IdxDiff.abs()))
1273 return std::nullopt;
1274 Safe = true;
1275 }
1276
1277 if (Safe)
1278 return IdxDiff * Stride;
1279 return std::nullopt;
1280}
1281
1282std::optional<APInt> Vectorizer::getConstantOffsetSelects(
1283 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1284 if (Depth++ == MaxDepth)
1285 return std::nullopt;
1286
1287 if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
1288 if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
1289 if (SelectA->getCondition() != SelectB->getCondition())
1290 return std::nullopt;
1291 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
1292 << ", PtrB=" << *PtrB << ", ContextInst="
1293 << *ContextInst << ", Depth=" << Depth << "\n");
1294 std::optional<APInt> TrueDiff = getConstantOffset(
1295 SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
1296 if (!TrueDiff)
1297 return std::nullopt;
1298 std::optional<APInt> FalseDiff =
1299 getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
1300 ContextInst, Depth);
1301 if (TrueDiff == FalseDiff)
1302 return TrueDiff;
1303 }
1304 }
1305 return std::nullopt;
1306}
1307
1308EquivalenceClassMap
1309Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
1311 EquivalenceClassMap Ret;
1312
1313 auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
1314 const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
1315 if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
1316 // The select's themselves are distinct instructions even if they share
1317 // the same condition and evaluate to consecutive pointers for true and
1318 // false values of the condition. Therefore using the select's themselves
1319 // for grouping instructions would put consecutive accesses into different
1320 // lists and they won't be even checked for being consecutive, and won't
1321 // be vectorized.
1322 return Sel->getCondition();
1323 }
1324 return ObjPtr;
1325 };
1326
1327 for (Instruction &I : make_range(Begin, End)) {
1328 auto *LI = dyn_cast<LoadInst>(&I);
1329 auto *SI = dyn_cast<StoreInst>(&I);
1330 if (!LI && !SI)
1331 continue;
1332
1333 if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
1334 continue;
1335
1336 if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
1337 (SI && !TTI.isLegalToVectorizeStore(SI)))
1338 continue;
1339
1340 Type *Ty = getLoadStoreType(&I);
1342 continue;
1343
1344 // Skip weird non-byte sizes. They probably aren't worth the effort of
1345 // handling correctly.
1346 unsigned TySize = DL.getTypeSizeInBits(Ty);
1347 if ((TySize % 8) != 0)
1348 continue;
1349
1350 // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
1351 // functions are currently using an integer type for the vectorized
1352 // load/store, and does not support casting between the integer type and a
1353 // vector of pointers (e.g. i64 to <2 x i16*>)
1354 if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
1355 continue;
1356
1358 unsigned AS = Ptr->getType()->getPointerAddressSpace();
1359 unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1360
1361 unsigned VF = VecRegSize / TySize;
1362 VectorType *VecTy = dyn_cast<VectorType>(Ty);
1363
1364 // Only handle power-of-two sized elements.
1365 if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
1366 (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
1367 continue;
1368
1369 // No point in looking at these if they're too big to vectorize.
1370 if (TySize > VecRegSize / 2 ||
1371 (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
1372 continue;
1373
1374 Ret[{GetUnderlyingObject(Ptr), AS,
1375 DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
1376 /*IsLoad=*/LI != nullptr}]
1377 .emplace_back(&I);
1378 }
1379
1380 return Ret;
1381}
1382
1383std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
1384 if (Instrs.empty())
1385 return {};
1386
1387 unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
1388 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
1389
1390#ifndef NDEBUG
1391 // Check that Instrs is in BB order and all have the same addr space.
1392 for (size_t I = 1; I < Instrs.size(); ++I) {
1393 assert(Instrs[I - 1]->comesBefore(Instrs[I]));
1394 assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
1395 }
1396#endif
1397
1398 // Machinery to build an MRU-hashtable of Chains.
1399 //
1400 // (Ideally this could be done with MapVector, but as currently implemented,
1401 // moving an element to the front of a MapVector is O(n).)
1402 struct InstrListElem : ilist_node<InstrListElem>,
1403 std::pair<Instruction *, Chain> {
1404 explicit InstrListElem(Instruction *I)
1405 : std::pair<Instruction *, Chain>(I, {}) {}
1406 };
1407 struct InstrListElemDenseMapInfo {
1408 using PtrInfo = DenseMapInfo<InstrListElem *>;
1409 using IInfo = DenseMapInfo<Instruction *>;
1410 static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
1411 static InstrListElem *getTombstoneKey() {
1412 return PtrInfo::getTombstoneKey();
1413 }
1414 static unsigned getHashValue(const InstrListElem *E) {
1415 return IInfo::getHashValue(E->first);
1416 }
1417 static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
1418 if (A == getEmptyKey() || B == getEmptyKey())
1419 return A == getEmptyKey() && B == getEmptyKey();
1420 if (A == getTombstoneKey() || B == getTombstoneKey())
1421 return A == getTombstoneKey() && B == getTombstoneKey();
1422 return IInfo::isEqual(A->first, B->first);
1423 }
1424 };
1428
1429 // Compare each instruction in `instrs` to leader of the N most recently-used
1430 // chains. This limits the O(n^2) behavior of this pass while also allowing
1431 // us to build arbitrarily long chains.
1432 for (Instruction *I : Instrs) {
1433 constexpr int MaxChainsToTry = 64;
1434
1435 bool MatchFound = false;
1436 auto ChainIter = MRU.begin();
1437 for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
1438 ++J, ++ChainIter) {
1439 if (std::optional<APInt> Offset = getConstantOffset(
1440 getLoadStorePointerOperand(ChainIter->first),
1442 /*ContextInst=*/
1443 (ChainIter->first->comesBefore(I) ? I : ChainIter->first))) {
1444 // `Offset` might not have the expected number of bits, if e.g. AS has a
1445 // different number of bits than opaque pointers.
1446 ChainIter->second.emplace_back(I, Offset.value());
1447 // Move ChainIter to the front of the MRU list.
1448 MRU.remove(*ChainIter);
1449 MRU.push_front(*ChainIter);
1450 MatchFound = true;
1451 break;
1452 }
1453 }
1454
1455 if (!MatchFound) {
1456 APInt ZeroOffset(ASPtrBits, 0);
1457 InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
1458 E->second.emplace_back(I, ZeroOffset);
1459 MRU.push_front(*E);
1460 Chains.insert(E);
1461 }
1462 }
1463
1464 std::vector<Chain> Ret;
1465 Ret.reserve(Chains.size());
1466 // Iterate over MRU rather than Chains so the order is deterministic.
1467 for (auto &E : MRU)
1468 if (E.second.size() > 1)
1469 Ret.emplace_back(std::move(E.second));
1470 return Ret;
1471}
1472
1473std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
1474 Instruction *ContextInst,
1475 unsigned Depth) {
1476 LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
1477 << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst
1478 << ", Depth=" << Depth << "\n");
1479 // We'll ultimately return a value of this bit width, even if computations
1480 // happen in a different width.
1481 unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
1482 APInt OffsetA(OrigBitWidth, 0);
1483 APInt OffsetB(OrigBitWidth, 0);
1484 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1485 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1486 unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
1487 if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
1488 return std::nullopt;
1489
1490 // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
1491 // should properly handle a possible overflow and the value should fit into
1492 // the smallest data type used in the cast/gep chain.
1493 assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
1494 OffsetB.getSignificantBits() <= NewPtrBitWidth);
1495
1496 OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
1497 OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
1498 if (PtrA == PtrB)
1499 return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);
1500
1501 // Try to compute B - A.
1502 const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
1503 if (DistScev != SE.getCouldNotCompute()) {
1504 LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
1505 ConstantRange DistRange = SE.getSignedRange(DistScev);
1506 if (DistRange.isSingleElement()) {
1507 // Handle index width (the width of Dist) != pointer width (the width of
1508 // the Offset*s at this point).
1509 APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
1510 return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
1511 }
1512 }
1513 if (std::optional<APInt> Diff =
1514 getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth))
1515 return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
1516 .sextOrTrunc(OrigBitWidth);
1517 return std::nullopt;
1518}
AMDGPU Mark last scratch load
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Expand Atomic instructions
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
Module.h This file contains the declarations for the Module class.
static const unsigned MaxDepth
static bool checkNoWrapFlags(Instruction *I, bool Signed)
static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA, unsigned MatchingOpIdxA, Instruction *AddOpB, unsigned MatchingOpIdxB, bool Signed)
#define DEBUG_TYPE
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static bool isSafeToMove(const MachineInstr &From, const MachineInstr &To)
Check if it's safe to move From down to To, checking that no physical registers are clobbered.
Basic Register Allocator
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
Provides some synthesis utilities to produce sequences of values.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
This pass exposes codegen information to IR-level passes.
A manager for alias analyses.
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Check whether or not an instruction may read or write the optionally specified memory location.
Class for arbitrary precision integers.
Definition: APInt.h:78
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1407
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:986
APInt abs() const
Get the absolute value.
Definition: APInt.h:1773
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition: APInt.h:1201
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1468
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1111
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1166
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1015
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition: APInt.h:475
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1237
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:256
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
A function analysis which provides an AssumptionCache.
AssumptionCache run(Function &F, FunctionAnalysisManager &)
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
This class represents a range of values.
Definition: ConstantRange.h:47
const APInt * getSingleElement() const
If this set contains a single element, return it, otherwise return null.
bool isSingleElement() const
Return true if this set contains exactly one member.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:317
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:310
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.
Legacy wrapper pass to provide the GlobalsAAResult object.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2697
bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:368
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
An instruction for reading from memory.
Definition: Instructions.h:176
bool isSimple() const
Definition: Instructions.h:247
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
Legacy wrapper pass to provide the SCEVAAResult object.
This class represents an analyzed expression in the program.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getCouldNotCompute()
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:389
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
Analysis pass providing the TargetTransformInfo.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
bool isLegalToVectorizeLoad(LoadInst *LI) const
bool isLegalToVectorizeStore(StoreInst *SI) const
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace=0, Align Alignment=Align(1), unsigned *Fast=nullptr) const
Determine if the target supports unaligned memory accesses.
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
Value * getOperand(unsigned i) const
Definition: User.h:228
unsigned getNumOperands() const
Definition: User.h:250
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition: Value.h:740
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition: Value.cpp:694
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
size_type size() const
Definition: DenseSet.h:81
TypeSize getSequentialElementStride(const DataLayout &DL) const
const ParentTy * getParent() const
Definition: ilist_node.h:32
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A simple intrusive list implementation.
Definition: simple_ilist.h:81
void push_front(reference Node)
Insert a node at the front; never copies.
Definition: simple_ilist.h:150
void remove(reference N)
Remove a node by reference; never deletes.
Definition: simple_ilist.h:189
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:2004
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:546
Pass * createLoadStoreVectorizerPass()
Create a legacy pass manager instance of the LoadStoreVectorizer pass.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< po_iterator< T > > post_order(const T &G)
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
Align getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to ensure that the alignment of V is at least PrefAlign bytes.
Definition: Local.cpp:1581
bool isModSet(const ModRefInfo MRI)
Definition: ModRef.h:48
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: ModRef.h:27
TargetTransformInfo TTI
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:2014
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:303
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1873
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
gep_type_iterator gep_type_begin(const User *GEP)
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2099
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &)
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:52
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117