// Doxygen-extracted copy of LLVM 19.0.0git — LoadStoreVectorizer.cpp.
// (Banner lines from the documentation scrape, commented out so they are not
// bare tokens in a C++ translation unit.)
1//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass merges loads/stores to/from sequential memory addresses into vector
10// loads/stores. Although there's nothing GPU-specific in here, this pass is
11// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12//
13// (For simplicity below we talk about loads only, but everything also applies
14// to stores.)
15//
16// This pass is intended to be run late in the pipeline, after other
17// vectorization opportunities have been exploited. So the assumption here is
18// that immediately following our new vector load we'll need to extract out the
19// individual elements of the load, so we can operate on them individually.
20//
21// On CPUs this transformation is usually not beneficial, because extracting the
22// elements of a vector register is expensive on most architectures. It's
23// usually better just to load each element individually into its own scalar
24// register.
25//
26// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27// "vector load" loads directly into a series of scalar registers. In effect,
28// extracting the elements of the vector is free. It's therefore always
29// beneficial to vectorize a sequence of loads on these architectures.
30//
31// Vectorizing (perhaps a better name might be "coalescing") loads can have
32// large performance impacts on GPU kernels, and opportunities for vectorizing
33// are common in GPU code. This pass tries very hard to find such
34// opportunities; its runtime is quadratic in the number of loads in a BB.
35//
36// Some CPU architectures, such as ARM, have instructions that load into
37// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38// could use this pass (with some modifications), but currently it implements
39// its own pass to do something similar to what we do here.
40//
41// Overview of the algorithm and terminology in this pass:
42//
43// - Break up each basic block into pseudo-BBs, composed of instructions which
44// are guaranteed to transfer control to their successors.
45// - Within a single pseudo-BB, find all loads, and group them into
46// "equivalence classes" according to getUnderlyingObject() and loaded
47// element size. Do the same for stores.
48// - For each equivalence class, greedily build "chains". Each chain has a
49// leader instruction, and every other member of the chain has a known
50// constant offset from the first instr in the chain.
51// - Break up chains so that they contain only contiguous accesses of legal
52// size with no intervening may-alias instrs.
53// - Convert each chain to vector instructions.
54//
55// The O(n^2) behavior of this pass comes from initially building the chains.
56// In the worst case we have to compare each new instruction to all of those
57// that came before. To limit this, we only calculate the offset to the leaders
58// of the N most recently-used chains.
59
61#include "llvm/ADT/APInt.h"
62#include "llvm/ADT/ArrayRef.h"
63#include "llvm/ADT/DenseMap.h"
64#include "llvm/ADT/MapVector.h"
66#include "llvm/ADT/STLExtras.h"
67#include "llvm/ADT/Sequence.h"
70#include "llvm/ADT/Statistic.h"
79#include "llvm/IR/Attributes.h"
80#include "llvm/IR/BasicBlock.h"
82#include "llvm/IR/Constants.h"
83#include "llvm/IR/DataLayout.h"
85#include "llvm/IR/Dominators.h"
86#include "llvm/IR/Function.h"
88#include "llvm/IR/IRBuilder.h"
89#include "llvm/IR/InstrTypes.h"
90#include "llvm/IR/Instruction.h"
92#include "llvm/IR/LLVMContext.h"
93#include "llvm/IR/Module.h"
94#include "llvm/IR/Type.h"
95#include "llvm/IR/Value.h"
97#include "llvm/Pass.h"
100#include "llvm/Support/Debug.h"
103#include "llvm/Support/ModRef.h"
106#include <algorithm>
107#include <cassert>
108#include <cstdint>
109#include <cstdlib>
110#include <iterator>
111#include <numeric>
112#include <optional>
113#include <tuple>
114#include <type_traits>
115#include <utility>
116#include <vector>
117
118using namespace llvm;
119
120#define DEBUG_TYPE "load-store-vectorizer"
121
122STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
123STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
124
125namespace {
126
127// Equivalence class key, the initial tuple by which we group loads/stores.
128// Loads/stores with different EqClassKeys are never merged.
129//
130// (We could in theory remove element-size from the this tuple. We'd just need
131// to fix up the vector packing/unpacking code.)
132using EqClassKey =
133 std::tuple<const Value * /* result of getUnderlyingObject() */,
134 unsigned /* AddrSpace */,
135 unsigned /* Load/Store element size bits */,
136 char /* IsLoad; char b/c bool can't be a DenseMap key */
137 >;
139 const EqClassKey &K) {
140 const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
141 return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
142 << " of element size " << ElementSize << " bits in addrspace "
143 << AddrSpace;
144}
145
146// A Chain is a set of instructions such that:
147// - All instructions have the same equivalence class, so in particular all are
148// loads, or all are stores.
149// - We know the address accessed by the i'th chain elem relative to the
150// chain's leader instruction, which is the first instr of the chain in BB
151// order.
152//
153// Chains have two canonical orderings:
154// - BB order, sorted by Instr->comesBefore.
155// - Offset order, sorted by OffsetFromLeader.
156// This pass switches back and forth between these orders.
struct ChainElem {
  // The load or store instruction itself.
  Instruction *Inst;
  // Byte distance of this access from the chain's leader, i.e. from the first
  // chain member in BB order. Signed (APInt), so it may be negative.
  APInt OffsetFromLeader;
};
161using Chain = SmallVector<ChainElem, 1>;
162
163void sortChainInBBOrder(Chain &C) {
164 sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
165}
166
167void sortChainInOffsetOrder(Chain &C) {
168 sort(C, [](const auto &A, const auto &B) {
169 if (A.OffsetFromLeader != B.OffsetFromLeader)
170 return A.OffsetFromLeader.slt(B.OffsetFromLeader);
171 return A.Inst->comesBefore(B.Inst); // stable tiebreaker
172 });
173}
174
175[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
176 for (const auto &E : C) {
177 dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
178 }
179}
180
181using EquivalenceClassMap =
183
184// FIXME: Assuming stack alignment of 4 is always good enough
185constexpr unsigned StackAdjustedAlignment = 4;
186
189 for (const ChainElem &E : C)
190 Values.push_back(E.Inst);
191 return propagateMetadata(I, Values);
192}
193
194bool isInvariantLoad(const Instruction *I) {
195 const LoadInst *LI = dyn_cast<LoadInst>(I);
196 return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
197}
198
199/// Reorders the instructions that I depends on (the instructions defining its
200/// operands), to ensure they dominate I.
/// Reorders the instructions that I depends on (the instructions defining its
/// operands), to ensure they dominate I.
void reorder(Instruction *I) {
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  // NOTE(review): the declaration of Worklist (presumably a
  // SmallVector<Instruction *, 16>) is not visible in this extracted copy --
  // confirm against upstream.

  Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int i = 0; i < NumOperands; i++) {
      // Non-instruction operands (constants, arguments) never need moving.
      // PHIs are pinned to the top of their block, so skip them too.
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      // Operand defined after I: it must be hoisted, and its own operands
      // must be checked transitively.
      if (!IM->comesBefore(I)) {
        InstructionsToMove.insert(IM);
        Worklist.push_back(IM);
      }
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
    // Advance before moving: moveBefore would otherwise invalidate BBI.
    Instruction *IM = &*(BBI++);
    if (!InstructionsToMove.count(IM))
      continue;
    IM->moveBefore(I);
  }
}
234
235class Vectorizer {
236 Function &F;
237 AliasAnalysis &AA;
238 AssumptionCache &AC;
239 DominatorTree &DT;
240 ScalarEvolution &SE;
242 const DataLayout &DL;
243 IRBuilder<> Builder;
244
245 // We could erase instrs right after vectorizing them, but that can mess up
246 // our BB iterators, and also can make the equivalence class keys point to
247 // freed memory. This is fixable, but it's simpler just to wait until we're
248 // done with the BB and erase all at once.
250
251public:
252 Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
254 : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
255 DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
256
257 bool run();
258
259private:
260 static const unsigned MaxDepth = 3;
261
262 /// Runs the vectorizer on a "pseudo basic block", which is a range of
263 /// instructions [Begin, End) within one BB all of which have
264 /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
265 bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
266
267 /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
268 /// in the same BB with the same value for getUnderlyingObject() etc.
269 bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
271
272 /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
273 /// where all instructions access a known, constant offset from the first
274 /// instruction.
275 bool runOnChain(Chain &C);
276
277 /// Splits the chain into subchains of instructions which read/write a
278 /// contiguous block of memory. Discards any length-1 subchains (because
279 /// there's nothing to vectorize in there).
280 std::vector<Chain> splitChainByContiguity(Chain &C);
281
282 /// Splits the chain into subchains where it's safe to hoist loads up to the
283 /// beginning of the sub-chain and it's safe to sink loads up to the end of
284 /// the sub-chain. Discards any length-1 subchains.
285 std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
286
287 /// Splits the chain into subchains that make legal, aligned accesses.
288 /// Discards any length-1 subchains.
289 std::vector<Chain> splitChainByAlignment(Chain &C);
290
291 /// Converts the instrs in the chain into a single vectorized load or store.
292 /// Adds the old scalar loads/stores to ToErase.
293 bool vectorizeChain(Chain &C);
294
295 /// Tries to compute the offset in bytes PtrB - PtrA.
296 std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
297 Instruction *ContextInst,
298 unsigned Depth = 0);
299 std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
300 Instruction *ContextInst,
301 unsigned Depth);
302 std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
303 Instruction *ContextInst,
304 unsigned Depth);
305
306 /// Gets the element type of the vector that the chain will load or store.
307 /// This is nontrivial because the chain may contain elements of different
308 /// types; e.g. it's legal to have a chain that contains both i32 and float.
309 Type *getChainElemTy(const Chain &C);
310
311 /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
312 /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
313 /// instructions.
314 ///
315 /// The map ChainElemOffsets must contain all of the elements in
316 /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
317 /// address. It's ok if it contains additional entries.
318 template <bool IsLoadChain>
319 bool isSafeToMove(
320 Instruction *ChainElem, Instruction *ChainBegin,
322
323 /// Collects loads and stores grouped by "equivalence class", where:
324 /// - all elements in an eq class are a load or all are a store,
325 /// - they all load/store the same element size (it's OK to have e.g. i8 and
326 /// <4 x i8> in the same class, but not i32 and <4 x i8>), and
327 /// - they all have the same value for getUnderlyingObject().
328 EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
330
331 /// Partitions Instrs into "chains" where every instruction has a known
332 /// constant offset from the first instr in the chain.
333 ///
334 /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
335 /// in the chain is the leader, and an instr touches distance 0 from itself.
336 std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
337};
338
// Legacy pass-manager wrapper that gathers analyses and runs the Vectorizer.
class LoadStoreVectorizerLegacyPass : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
    // NOTE(review): the pass-registry initialization call that belongs in
    // this constructor body is not visible in this extracted copy -- confirm
    // against upstream.
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // NOTE(review): the AU.addRequired<...>/addPreserved<...> declarations
    // for the analyses used by runOnFunction are not visible in this
    // extracted copy -- confirm against upstream.
    AU.setPreservesCFG();
  }
};
363
364} // end anonymous namespace
365
366char LoadStoreVectorizerLegacyPass::ID = 0;
367
368INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
369 "Vectorize load and Store instructions", false, false)
376INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
377 "Vectorize load and store instructions", false, false)
378
380 return new LoadStoreVectorizerLegacyPass();
381}
382
// Legacy-PM entry point: fetches the required analyses and delegates to
// Vectorizer::run().
bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  // NOTE(review): the left-hand side of this statement (presumably
  // "TargetTransformInfo &TTI =") is not visible in this extracted copy --
  // confirm against upstream.
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  AssumptionCache &AC =
      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

  return Vectorizer(F, AA, AC, DT, SE, TTI).run();
}
399
402 // Don't vectorize when the attribute NoImplicitFloat is used.
403 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
404 return PreservedAnalyses::all();
405
411
412 bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
415 return Changed ? PA : PreservedAnalyses::all();
416}
417
// Top-level driver: splits each BB into pseudo-BBs at "barrier" instructions,
// vectorizes each pseudo-BB, then erases the replaced scalar accesses.
bool Vectorizer::run() {
  bool Changed = false;
  // Break up the BB if there are any instrs which aren't guaranteed to transfer
  // execution to their successor.
  //
  // Consider, for example:
  //
  // def assert_arr_len(int n) { if (n < 2) exit(); }
  //
  // load arr[0]
  // call assert_array_len(arr.length)
  // load arr[1]
  //
  // Even though assert_arr_len does not read or write any memory, we can't
  // speculate the second load before the call. More info at
  // https://github.com/llvm/llvm-project/issues/52950.
  for (BasicBlock *BB : post_order(&F)) {
    // BB must at least have a terminator.
    assert(!BB->empty());

    // NOTE(review): the declaration of Barriers (presumably a small vector of
    // BasicBlock::iterator) is not visible in this extracted copy -- confirm
    // against upstream.
    Barriers.push_back(BB->begin());
    for (Instruction &I : *BB)
      // NOTE(review): the guard that selects which instructions act as
      // barriers is not visible in this extracted copy -- confirm against
      // upstream.
      Barriers.push_back(I.getIterator());
    Barriers.push_back(BB->end());

    // Each adjacent pair of barrier iterators delimits one pseudo-BB.
    for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
         ++It)
      Changed |= runOnPseudoBB(*It, *std::next(It));

    // Deferred cleanup: erase the scalar loads/stores we vectorized, now that
    // iteration over the BB is finished (see comment on ToErase).
    for (Instruction *I : ToErase) {
      auto *PtrOperand = getLoadStorePointerOperand(I);
      if (I->use_empty())
        I->eraseFromParent();
      // NOTE(review): the follow-up cleanup that consumed PtrOperand is not
      // visible in this extracted copy -- confirm against upstream.
    }
    ToErase.clear();
  }

  return Changed;
}
460
// Vectorizes one pseudo-BB: the half-open instruction range [Begin, End)
// within a single BB, all of which executes together.
// NOTE(review): the second line of this signature (presumably
// "BasicBlock::iterator End) {") is not visible in this extracted copy --
// confirm against upstream.
bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
  LLVM_DEBUG({
    dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
    if (End != Begin->getParent()->end())
      dbgs() << *End;
    else
      dbgs() << "<BB end>";
    dbgs() << ")\n";
  });

  bool Changed = false;
  // Each equivalence class (same underlying object, addrspace, element size,
  // and load-vs-store) is processed independently.
  for (const auto &[EqClassKey, EqClass] :
       collectEquivalenceClasses(Begin, End))
    Changed |= runOnEquivalenceClass(EqClassKey, EqClass);

  return Changed;
}
479
480bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
481 ArrayRef<Instruction *> EqClass) {
482 bool Changed = false;
483
484 LLVM_DEBUG({
485 dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
486 << " keyed on " << EqClassKey << ":\n";
487 for (Instruction *I : EqClass)
488 dbgs() << " " << *I << "\n";
489 });
490
491 std::vector<Chain> Chains = gatherChains(EqClass);
492 LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
493 << " nontrivial chains.\n";);
494 for (Chain &C : Chains)
495 Changed |= runOnChain(C);
496 return Changed;
497}
498
499bool Vectorizer::runOnChain(Chain &C) {
500 LLVM_DEBUG({
501 dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
502 dumpChain(C);
503 });
504
505 // Split up the chain into increasingly smaller chains, until we can finally
506 // vectorize the chains.
507 //
508 // (Don't be scared by the depth of the loop nest here. These operations are
509 // all at worst O(n lg n) in the number of instructions, and splitting chains
510 // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
511 bool Changed = false;
512 for (auto &C : splitChainByMayAliasInstrs(C))
513 for (auto &C : splitChainByContiguity(C))
514 for (auto &C : splitChainByAlignment(C))
515 Changed |= vectorizeChain(C);
516 return Changed;
517}
518
519std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
520 if (C.empty())
521 return {};
522
523 sortChainInBBOrder(C);
524
525 LLVM_DEBUG({
526 dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
527 dumpChain(C);
528 });
529
530 // We know that elements in the chain with nonverlapping offsets can't
531 // alias, but AA may not be smart enough to figure this out. Use a
532 // hashtable.
533 DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
534 for (const auto &E : C)
535 ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
536
537 // Loads get hoisted up to the first load in the chain. Stores get sunk
538 // down to the last store in the chain. Our algorithm for loads is:
539 //
540 // - Take the first element of the chain. This is the start of a new chain.
541 // - Take the next element of `Chain` and check for may-alias instructions
542 // up to the start of NewChain. If no may-alias instrs, add it to
543 // NewChain. Otherwise, start a new NewChain.
544 //
545 // For stores it's the same except in the reverse direction.
546 //
547 // We expect IsLoad to be an std::bool_constant.
548 auto Impl = [&](auto IsLoad) {
549 // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
550 auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
551 if constexpr (IsLoad())
552 return std::make_pair(C.begin(), C.end());
553 else
554 return std::make_pair(C.rbegin(), C.rend());
555 }(IsLoad);
556 assert(ChainBegin != ChainEnd);
557
558 std::vector<Chain> Chains;
560 NewChain.push_back(*ChainBegin);
561 for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
562 if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
563 ChainOffsets)) {
564 LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
565 << *ChainIt->Inst << " into " << *ChainBegin->Inst
566 << "\n");
567 NewChain.push_back(*ChainIt);
568 } else {
570 dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
571 << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
572 if (NewChain.size() > 1) {
573 LLVM_DEBUG({
574 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
575 dumpChain(NewChain);
576 });
577 Chains.push_back(std::move(NewChain));
578 }
579
580 // Start a new chain.
581 NewChain = SmallVector<ChainElem, 1>({*ChainIt});
582 }
583 }
584 if (NewChain.size() > 1) {
585 LLVM_DEBUG({
586 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
587 dumpChain(NewChain);
588 });
589 Chains.push_back(std::move(NewChain));
590 }
591 return Chains;
592 };
593
594 if (isa<LoadInst>(C[0].Inst))
595 return Impl(/*IsLoad=*/std::bool_constant<true>());
596
597 assert(isa<StoreInst>(C[0].Inst));
598 return Impl(/*IsLoad=*/std::bool_constant<false>());
599}
600
601std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
602 if (C.empty())
603 return {};
604
605 sortChainInOffsetOrder(C);
606
607 LLVM_DEBUG({
608 dbgs() << "LSV: splitChainByContiguity considering chain:\n";
609 dumpChain(C);
610 });
611
612 std::vector<Chain> Ret;
613 Ret.push_back({C.front()});
614
615 for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
616 // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
617 auto &CurChain = Ret.back();
618 const ChainElem &Prev = CurChain.back();
619 unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
620 assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
621 "collectEquivalenceClass");
622 APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
623
624 // Add this instruction to the end of the current chain, or start a new one.
625 bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
626 LLVM_DEBUG(dbgs() << "LSV: Instructions are "
627 << (AreContiguous ? "" : "not ") << "contiguous: "
628 << *Prev.Inst << " (ends at offset " << PrevReadEnd
629 << ") -> " << *It->Inst << " (starts at offset "
630 << It->OffsetFromLeader << ")\n");
631 if (AreContiguous)
632 CurChain.push_back(*It);
633 else
634 Ret.push_back({*It});
635 }
636
637 // Filter out length-1 chains, these are uninteresting.
638 llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
639 return Ret;
640}
641
642Type *Vectorizer::getChainElemTy(const Chain &C) {
643 assert(!C.empty());
644 // The rules are:
645 // - If there are any pointer types in the chain, use an integer type.
646 // - Prefer an integer type if it appears in the chain.
647 // - Otherwise, use the first type in the chain.
648 //
649 // The rule about pointer types is a simplification when we merge e.g. a load
650 // of a ptr and a double. There's no direct conversion from a ptr to a
651 // double; it requires a ptrtoint followed by a bitcast.
652 //
653 // It's unclear to me if the other rules have any practical effect, but we do
654 // it to match this pass's previous behavior.
655 if (any_of(C, [](const ChainElem &E) {
656 return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
657 })) {
658 return Type::getIntNTy(
659 F.getContext(),
660 DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
661 }
662
663 for (const ChainElem &E : C)
664 if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
665 return T;
666 return getLoadStoreType(C[0].Inst)->getScalarType();
667}
668
669std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
670 // We use a simple greedy algorithm.
671 // - Given a chain of length N, find all prefixes that
672 // (a) are not longer than the max register length, and
673 // (b) are a power of 2.
674 // - Starting from the longest prefix, try to create a vector of that length.
675 // - If one of them works, great. Repeat the algorithm on any remaining
676 // elements in the chain.
677 // - If none of them work, discard the first element and repeat on a chain
678 // of length N-1.
679 if (C.empty())
680 return {};
681
682 sortChainInOffsetOrder(C);
683
684 LLVM_DEBUG({
685 dbgs() << "LSV: splitChainByAlignment considering chain:\n";
686 dumpChain(C);
687 });
688
689 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
690 auto getVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
691 unsigned ChainSizeBytes, VectorType *VecTy) {
692 return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
693 ChainSizeBytes, VecTy)
694 : TTI.getStoreVectorFactor(VF, LoadStoreSize,
695 ChainSizeBytes, VecTy);
696 };
697
698#ifndef NDEBUG
699 for (const auto &E : C) {
700 Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
701 assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
702 "Should have filtered out non-power-of-two elements in "
703 "collectEquivalenceClasses.");
704 }
705#endif
706
707 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
708 unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
709
710 std::vector<Chain> Ret;
711 for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
712 // Find candidate chains of size not greater than the largest vector reg.
713 // These chains are over the closed interval [CBegin, CEnd].
714 SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
715 CandidateChains;
716 for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
717 APInt Sz = C[CEnd].OffsetFromLeader +
718 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
719 C[CBegin].OffsetFromLeader;
720 if (Sz.sgt(VecRegBytes))
721 break;
722 CandidateChains.push_back(
723 {CEnd, static_cast<unsigned>(Sz.getLimitedValue())});
724 }
725
726 // Consider the longest chain first.
727 for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
728 It != End; ++It) {
729 auto [CEnd, SizeBytes] = *It;
731 dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
732 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
733
734 Type *VecElemTy = getChainElemTy(C);
735 // Note, VecElemTy is a power of 2, but might be less than one byte. For
736 // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
737 // VecElemTy would be i4.
738 unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);
739
740 // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
741 assert((8 * SizeBytes) % VecElemBits == 0);
742 unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
743 FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
744 unsigned VF = 8 * VecRegBytes / VecElemBits;
745
746 // Check that TTI is happy with this vectorization factor.
747 unsigned TargetVF = getVectorFactor(VF, VecElemBits,
748 VecElemBits * NumVecElems / 8, VecTy);
749 if (TargetVF != VF && TargetVF < NumVecElems) {
751 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
752 "because TargetVF="
753 << TargetVF << " != VF=" << VF
754 << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
755 continue;
756 }
757
758 // Is a load/store with this alignment allowed by TTI and at least as fast
759 // as an unvectorized load/store?
760 //
761 // TTI and F are passed as explicit captures to WAR an MSVC misparse (??).
762 auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
763 &F = F](Align Alignment) {
764 if (Alignment.value() % SizeBytes == 0)
765 return true;
766 unsigned VectorizedSpeed = 0;
767 bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
768 F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
769 if (!AllowsMisaligned) {
771 << "LSV: Access of " << SizeBytes << "B in addrspace "
772 << AS << " with alignment " << Alignment.value()
773 << " is misaligned, and therefore can't be vectorized.\n");
774 return false;
775 }
776
777 unsigned ElementwiseSpeed = 0;
778 (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
779 Alignment, &ElementwiseSpeed);
780 if (VectorizedSpeed < ElementwiseSpeed) {
782 << "LSV: Access of " << SizeBytes << "B in addrspace "
783 << AS << " with alignment " << Alignment.value()
784 << " has relative speed " << VectorizedSpeed
785 << ", which is lower than the elementwise speed of "
786 << ElementwiseSpeed
787 << ". Therefore this access won't be vectorized.\n");
788 return false;
789 }
790 return true;
791 };
792
793 // If we're loading/storing from an alloca, align it if possible.
794 //
795 // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
796 // tells us this is beneficial. This feels a bit odd, but it matches
797 // existing tests. This isn't *so* bad, because at most we align to 4
798 // bytes (current value of StackAdjustedAlignment).
799 //
800 // FIXME: We will upgrade the alignment of the alloca even if it turns out
801 // we can't vectorize for some other reason.
802 Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
803 bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
804 isa<AllocaInst>(PtrOperand->stripPointerCasts());
805 Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
806 Align PrefAlign = Align(StackAdjustedAlignment);
807 if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
808 IsAllowedAndFast(PrefAlign)) {
810 PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
811 if (NewAlign >= Alignment) {
813 << "LSV: splitByChain upgrading alloca alignment from "
814 << Alignment.value() << " to " << NewAlign.value()
815 << "\n");
816 Alignment = NewAlign;
817 }
818 }
819
820 if (!IsAllowedAndFast(Alignment)) {
822 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
823 "because its alignment is not AllowedAndFast: "
824 << Alignment.value() << "\n");
825 continue;
826 }
827
828 if ((IsLoadChain &&
829 !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
830 (!IsLoadChain &&
831 !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
833 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
834 "because !isLegalToVectorizeLoad/StoreChain.");
835 continue;
836 }
837
838 // Hooray, we can vectorize this chain!
839 Chain &NewChain = Ret.emplace_back();
840 for (unsigned I = CBegin; I <= CEnd; ++I)
841 NewChain.push_back(C[I]);
842 CBegin = CEnd; // Skip over the instructions we've added to the chain.
843 break;
844 }
845 }
846 return Ret;
847}
848
849bool Vectorizer::vectorizeChain(Chain &C) {
850 if (C.size() < 2)
851 return false;
852
853 sortChainInOffsetOrder(C);
854
855 LLVM_DEBUG({
856 dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
857 dumpChain(C);
858 });
859
860 Type *VecElemTy = getChainElemTy(C);
861 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
862 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
863 unsigned ChainBytes = std::accumulate(
864 C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
865 return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
866 });
867 assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
868 // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
869 // than 1 byte (e.g. VecTy == <32 x i1>).
870 Type *VecTy = FixedVectorType::get(
871 VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
872
873 Align Alignment = getLoadStoreAlignment(C[0].Inst);
874 // If this is a load/store of an alloca, we might have upgraded the alloca's
875 // alignment earlier. Get the new alignment.
876 if (AS == DL.getAllocaAddrSpace()) {
877 Alignment = std::max(
878 Alignment,
880 MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
881 }
882
883 // All elements of the chain must have the same scalar-type size.
884#ifndef NDEBUG
885 for (const ChainElem &E : C)
886 assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
887 DL.getTypeStoreSize(VecElemTy));
888#endif
889
890 Instruction *VecInst;
891 if (IsLoadChain) {
892 // Loads get hoisted to the location of the first load in the chain. We may
893 // also need to hoist the (transitive) operands of the loads.
894 Builder.SetInsertPoint(
895 llvm::min_element(C, [](const auto &A, const auto &B) {
896 return A.Inst->comesBefore(B.Inst);
897 })->Inst);
898
899 // Chain is in offset order, so C[0] is the instr with the lowest offset,
900 // i.e. the root of the vector.
901 VecInst = Builder.CreateAlignedLoad(VecTy,
903 Alignment);
904
905 unsigned VecIdx = 0;
906 for (const ChainElem &E : C) {
907 Instruction *I = E.Inst;
908 Value *V;
910 if (auto *VT = dyn_cast<FixedVectorType>(T)) {
911 auto Mask = llvm::to_vector<8>(
912 llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
913 V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
914 VecIdx += VT->getNumElements();
915 } else {
916 V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
917 I->getName());
918 ++VecIdx;
919 }
920 if (V->getType() != I->getType())
921 V = Builder.CreateBitOrPointerCast(V, I->getType());
922 I->replaceAllUsesWith(V);
923 }
924
925 // Finally, we need to reorder the instrs in the BB so that the (transitive)
926 // operands of VecInst appear before it. To see why, suppose we have
927 // vectorized the following code:
928 //
929 // ptr1 = gep a, 1
930 // load1 = load i32 ptr1
931 // ptr0 = gep a, 0
932 // load0 = load i32 ptr0
933 //
934 // We will put the vectorized load at the location of the earliest load in
935 // the BB, i.e. load1. We get:
936 //
937 // ptr1 = gep a, 1
938 // loadv = load <2 x i32> ptr0
939 // load0 = extractelement loadv, 0
940 // load1 = extractelement loadv, 1
941 // ptr0 = gep a, 0
942 //
943 // Notice that loadv uses ptr0, which is defined *after* it!
944 reorder(VecInst);
945 } else {
946 // Stores get sunk to the location of the last store in the chain.
947 Builder.SetInsertPoint(llvm::max_element(C, [](auto &A, auto &B) {
948 return A.Inst->comesBefore(B.Inst);
949 })->Inst);
950
951 // Build the vector to store.
952 Value *Vec = PoisonValue::get(VecTy);
953 unsigned VecIdx = 0;
954 auto InsertElem = [&](Value *V) {
955 if (V->getType() != VecElemTy)
956 V = Builder.CreateBitOrPointerCast(V, VecElemTy);
957 Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
958 };
959 for (const ChainElem &E : C) {
960 auto I = cast<StoreInst>(E.Inst);
961 if (FixedVectorType *VT =
962 dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
963 for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
964 InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
965 Builder.getInt32(J)));
966 }
967 } else {
968 InsertElem(I->getValueOperand());
969 }
970 }
971
972 // Chain is in offset order, so C[0] is the instr with the lowest offset,
973 // i.e. the root of the vector.
974 VecInst = Builder.CreateAlignedStore(
975 Vec,
977 Alignment);
978 }
979
980 propagateMetadata(VecInst, C);
981
982 for (const ChainElem &E : C)
983 ToErase.push_back(E.Inst);
984
985 ++NumVectorInstructions;
986 NumScalarsVectorized += C.size();
987 return true;
988}
989
// Returns true if ChainElem can be hoisted up to (loads) or sunk down to
// (stores) the position of ChainBegin without changing observable behavior,
// i.e. no instruction in between may alias ChainElem's memory access.
990template <bool IsLoadChain>
991bool Vectorizer::isSafeToMove(
992 Instruction *ChainElem, Instruction *ChainBegin,
// NOTE(review): one parameter line (original line 993, presumably the
// ChainOffsets map used below) is missing from this rendering -- confirm
// against the upstream file.
994 LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
995 << *ChainBegin << ")\n");
996
997 assert(isa<LoadInst>(ChainElem) == IsLoadChain);
// Moving an instruction to its own position is trivially safe.
998 if (ChainElem == ChainBegin)
999 return true;
1000
1001 // Invariant loads can always be reordered; by definition they are not
1002 // clobbered by stores.
1003 if (isInvariantLoad(ChainElem))
1004 return true;
1005
// Walk the instructions strictly between ChainElem and ChainBegin: backwards
// for loads (which get hoisted) and forwards for stores (which get sunk).
// std::next skips ChainElem itself.
1006 auto BBIt = std::next([&] {
1007 if constexpr (IsLoadChain)
1008 return BasicBlock::reverse_iterator(ChainElem);
1009 else
1010 return BasicBlock::iterator(ChainElem);
1011 }());
1012 auto BBItEnd = std::next([&] {
1013 if constexpr (IsLoadChain)
1014 return BasicBlock::reverse_iterator(ChainBegin);
1015 else
1016 return BasicBlock::iterator(ChainBegin);
1017 }());
1018
// Byte offset and access size of ChainElem within the chain, used for the
// precise in-chain overlap test below.
1019 const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
1020 const unsigned ChainElemSize =
1021 DL.getTypeStoreSize(getLoadStoreType(ChainElem));
1022
1023 for (; BBIt != BBItEnd; ++BBIt) {
1024 Instruction *I = &*BBIt;
1025
// Instructions that never touch memory cannot alias anything.
1026 if (!I->mayReadOrWriteMemory())
1027 continue;
1028
1029 // Loads can be reordered with other loads.
1030 if (IsLoadChain && isa<LoadInst>(I))
1031 continue;
1032
1033 // Stores can be sunk below invariant loads.
1034 if (!IsLoadChain && isInvariantLoad(I))
1035 continue;
1036
1037 // If I is in the chain, we can tell whether it aliases ChainIt by checking
1038 // what offset ChainIt accesses. This may be better than AA is able to do.
1039 //
1040 // We should really only have duplicate offsets for stores (the duplicate
1041 // loads should be CSE'ed), but in case we have a duplicate load, we'll
1042 // split the chain so we don't have to handle this case specially.
1043 if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
1044 // I and ChainElem overlap if:
1045 // - I and ChainElem have the same offset, OR
1046 // - I's offset is less than ChainElem's, but I touches past the
1047 // beginning of ChainElem, OR
1048 // - ChainElem's offset is less than I's, but ChainElem touches past the
1049 // beginning of I.
1050 const APInt &IOffset = OffsetIt->second;
1051 unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
1052 if (IOffset == ChainElemOffset ||
1053 (IOffset.sle(ChainElemOffset) &&
1054 (IOffset + IElemSize).sgt(ChainElemOffset)) ||
1055 (ChainElemOffset.sle(IOffset) &&
1056 (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
1057 LLVM_DEBUG({
1058 // Double check that AA also sees this alias. If not, we probably
1059 // have a bug.
1060 ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1061 assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
1062 dbgs() << "LSV: Found alias in chain: " << *I << "\n";
1063 });
1064 return false; // We found an aliasing instruction; bail.
1065 }
1066
1067 continue; // We're confident there's no alias.
1068 }
1069
// For instructions outside the chain, fall back to alias analysis. A load is
// only unsafe to move past a writer (Mod); a store also can't move past a
// reader (Mod or Ref).
1070 LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
1071 ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1072 if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
1073 LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
1074 << " Aliasing instruction:\n"
1075 << " " << *I << '\n'
1076 << " Aliased instruction and pointer:\n"
1077 << " " << *ChainElem << '\n'
1078 << " " << *getLoadStorePointerOperand(ChainElem)
1079 << '\n');
1080
1081 return false;
1082 }
1083 }
// No intervening instruction may alias ChainElem; the move is safe.
1084 return true;
1085}
1086
1088 BinaryOperator *BinOpI = cast<BinaryOperator>(I);
1089 return (Signed && BinOpI->hasNoSignedWrap()) ||
1090 (!Signed && BinOpI->hasNoUnsignedWrap());
1091}
1092
/// Tries to prove, purely from the structure of two nsw/nuw `add`
/// instructions that share one operand, that AddOpB equals AddOpA plus
/// IdxDiff without overflow. Returns true only when one of the three
/// patterns below matches; false means "could not prove", not "unsafe".
1093static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
1094 unsigned MatchingOpIdxA, Instruction *AddOpB,
1095 unsigned MatchingOpIdxB, bool Signed) {
1096 LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
1097 << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
1098 << MatchingOpIdxA << ", AddOpB=" << *AddOpB
1099 << ", MatchingOpIdxB=" << MatchingOpIdxB
1100 << ", Signed=" << Signed << "\n");
1101 // If both OpA and OpB are adds with NSW/NUW and with one of the operands
1102 // being the same, we can guarantee that the transformation is safe if we can
1103 // prove that OpA won't overflow when Ret added to the other operand of OpA.
1104 // For example:
1105 // %tmp7 = add nsw i32 %tmp2, %v0
1106 // %tmp8 = sext i32 %tmp7 to i64
1107 // ...
1108 // %tmp11 = add nsw i32 %v0, 1
1109 // %tmp12 = add nsw i32 %tmp2, %tmp11
1110 // %tmp13 = sext i32 %tmp12 to i64
1111 //
1112 // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
1113 // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
1114 // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
1115 assert(AddOpA->getOpcode() == Instruction::Add &&
1116 AddOpB->getOpcode() == Instruction::Add &&
1117 checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
// All three patterns below require the designated operands to be identical.
1118 if (AddOpA->getOperand(MatchingOpIdxA) ==
1119 AddOpB->getOperand(MatchingOpIdxB)) {
// "Other" operand = the one that is not the shared/matching operand.
1120 Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
1121 Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
1122 Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
1123 Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
1124 // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
1125 if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
1126 checkNoWrapFlags(OtherInstrB, Signed) &&
1127 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1128 int64_t CstVal =
1129 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1130 if (OtherInstrB->getOperand(0) == OtherOperandA &&
1131 IdxDiff.getSExtValue() == CstVal)
1132 return true;
1133 }
1134 // Match `x +nsw/nuw (y +nsw/nuw -Idx)` and `x +nsw/nuw (y +nsw/nuw x)`.
// Mirror of the previous case: here A carries the constant, so IdxDiff must
// equal its negation.
1135 if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
1136 checkNoWrapFlags(OtherInstrA, Signed) &&
1137 isa<ConstantInt>(OtherInstrA->getOperand(1))) {
1138 int64_t CstVal =
1139 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1140 if (OtherInstrA->getOperand(0) == OtherOperandB &&
1141 IdxDiff.getSExtValue() == -CstVal)
1142 return true;
1143 }
1144 // Match `x +nsw/nuw (y +nsw/nuw c)` and
1145 // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
// Both sides carry constants on the same base y; the difference of the two
// constants must equal IdxDiff.
1146 if (OtherInstrA && OtherInstrB &&
1147 OtherInstrA->getOpcode() == Instruction::Add &&
1148 OtherInstrB->getOpcode() == Instruction::Add &&
1149 checkNoWrapFlags(OtherInstrA, Signed) &&
1150 checkNoWrapFlags(OtherInstrB, Signed) &&
1151 isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
1152 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1153 int64_t CstValA =
1154 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1155 int64_t CstValB =
1156 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1157 if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
1158 IdxDiff.getSExtValue() == (CstValB - CstValA))
1159 return true;
1160 }
1161 }
// None of the recognized safe patterns matched.
1162 return false;
1163}
1164
// Attempts to compute the constant byte distance PtrB - PtrA for "complex"
// addresses: two GEPs that are identical except for their last index, where
// that index is hidden behind a zext/sext. Returns std::nullopt when the
// distance cannot be proven constant and overflow-free.
1165std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
1166 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1167 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
1168 << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
1169 << " Depth=" << Depth << "\n");
1170 auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
1171 auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
// Not a GEP pair; try the select-based matcher instead.
1172 if (!GEPA || !GEPB)
1173 return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);
1174
1175 // Look through GEPs after checking they're the same except for the last
1176 // index.
1177 if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
1178 GEPA->getPointerOperand() != GEPB->getPointerOperand())
1179 return std::nullopt;
1180 gep_type_iterator GTIA = gep_type_begin(GEPA);
1181 gep_type_iterator GTIB = gep_type_begin(GEPB);
// Require all indices except the last to be identical; after the loop both
// iterators sit on the final index.
1182 for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
1183 if (GTIA.getOperand() != GTIB.getOperand())
1184 return std::nullopt;
1185 ++GTIA;
1186 ++GTIB;
1187 }
1188
1189 Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
1190 Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
1191 if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
1192 OpA->getType() != OpB->getType())
1193 return std::nullopt;
1194
// Byte size of one element addressed by the final index.
1195 uint64_t Stride = GTIA.getSequentialElementStride(DL);
1196
1197 // Only look through a ZExt/SExt.
1198 if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
1199 return std::nullopt;
1200
1201 bool Signed = isa<SExtInst>(OpA);
1202
1203 // At this point A could be a function parameter, i.e. not an instruction
1204 Value *ValA = OpA->getOperand(0);
1205 OpB = dyn_cast<Instruction>(OpB->getOperand(0));
1206 if (!OpB || ValA->getType() != OpB->getType())
1207 return std::nullopt;
1208
// Ask ScalarEvolution for the (pre-extension) index difference B - A.
1209 const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
1210 const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
1211 const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
1212 if (IdxDiffSCEV == SE.getCouldNotCompute())
1213 return std::nullopt;
1214
// The difference is only usable if SCEV proves it is a single constant.
1215 ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
1216 if (!IdxDiffRange.isSingleElement())
1217 return std::nullopt;
1218 APInt IdxDiff = *IdxDiffRange.getSingleElement();
1219
1220 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
1221 << "\n");
1222
1223 // Now we need to prove that adding IdxDiff to ValA won't overflow.
1224 bool Safe = false;
1225
1226 // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
1227 // ValA, we're okay.
1228 if (OpB->getOpcode() == Instruction::Add &&
1229 isa<ConstantInt>(OpB->getOperand(1)) &&
1230 IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
// NOTE(review): one condition line (original line 1231, presumably the
// no-wrap-flag check on OpB) is missing from this rendering -- confirm
// against the upstream file.
1232 Safe = true;
1233
1234 // Second attempt: check if we have eligible add NSW/NUW instruction
1235 // sequences.
1236 OpA = dyn_cast<Instruction>(ValA);
1237 if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
1238 OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
1239 checkNoWrapFlags(OpB, Signed)) {
1240 // In the checks below a matching operand in OpA and OpB is an operand which
1241 // is the same in those two instructions. Below we account for possible
1242 // orders of the operands of these add instructions.
1243 for (unsigned MatchingOpIdxA : {0, 1})
1244 for (unsigned MatchingOpIdxB : {0, 1})
1245 if (!Safe)
1246 Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
1247 MatchingOpIdxB, Signed);
1248 }
1249
1250 unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
1251
1252 // Third attempt:
1253 //
1254 // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
1255 // order bit other than the sign bit are known to be zero in ValA, we can add
1256 // Diff to it while guaranteeing no overflow of any sort.
1257 //
1258 // If IdxDiff is negative, do the same, but swap ValA and ValB.
1259 if (!Safe) {
1260 // When computing known bits, use the GEPs as context instructions, since
1261 // they likely are in the same BB as the load/store.
1262 KnownBits Known(BitWidth);
1263 computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC,
1264 ContextInst, &DT);
1265 APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
// Under signed extension the sign bit may never be set without overflow.
1266 if (Signed)
1267 BitsAllowedToBeSet.clearBit(BitWidth - 1);
1268 if (BitsAllowedToBeSet.ult(IdxDiff.abs()))
1269 return std::nullopt;
1270 Safe = true;
1271 }
1272
// Scale the index difference by the element stride to get a byte offset.
1273 if (Safe)
1274 return IdxDiff * Stride;
1275 return std::nullopt;
1276}
1277
1278std::optional<APInt> Vectorizer::getConstantOffsetSelects(
1279 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1280 if (Depth++ == MaxDepth)
1281 return std::nullopt;
1282
1283 if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
1284 if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
1285 if (SelectA->getCondition() != SelectB->getCondition())
1286 return std::nullopt;
1287 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
1288 << ", PtrB=" << *PtrB << ", ContextInst="
1289 << *ContextInst << ", Depth=" << Depth << "\n");
1290 std::optional<APInt> TrueDiff = getConstantOffset(
1291 SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
1292 if (!TrueDiff.has_value())
1293 return std::nullopt;
1294 std::optional<APInt> FalseDiff =
1295 getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
1296 ContextInst, Depth);
1297 if (TrueDiff == FalseDiff)
1298 return TrueDiff;
1299 }
1300 }
1301 return std::nullopt;
1302}
1303
// Partitions the simple, legally-vectorizable loads/stores in [Begin, End)
// into equivalence classes keyed by (underlying object, address space,
// element size, load-vs-store); only accesses in the same class can ever be
// merged into one vector access.
1304EquivalenceClassMap
1305Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
// NOTE(review): the second signature line (original line 1306, presumably the
// End iterator parameter) is missing from this rendering -- confirm against
// the upstream file.
1307 EquivalenceClassMap Ret;
1308
// Local override of llvm::getUnderlyingObject that maps a select of pointers
// to its condition, so both arms land in the same equivalence class.
1309 auto getUnderlyingObject = [](const Value *Ptr) -> const Value * {
1310 const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
1311 if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
1312 // The select's themselves are distinct instructions even if they share
1313 // the same condition and evaluate to consecutive pointers for true and
1314 // false values of the condition. Therefore using the select's themselves
1315 // for grouping instructions would put consecutive accesses into different
1316 // lists and they won't be even checked for being consecutive, and won't
1317 // be vectorized.
1318 return Sel->getCondition();
1319 }
1320 return ObjPtr;
1321 };
1322
1323 for (Instruction &I : make_range(Begin, End)) {
// Only plain loads and stores participate.
1324 auto *LI = dyn_cast<LoadInst>(&I);
1325 auto *SI = dyn_cast<StoreInst>(&I);
1326 if (!LI && !SI)
1327 continue;
1328
// Skip volatile/atomic accesses.
1329 if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
1330 continue;
1331
// Respect target-specific legality.
1332 if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
1333 (SI && !TTI.isLegalToVectorizeStore(SI)))
1334 continue;
1335
1336 Type *Ty = getLoadStoreType(&I);
// NOTE(review): the condition guarding this `continue` (original line 1337)
// is missing from this rendering -- confirm against the upstream file.
1338 continue;
1339
1340 // Skip weird non-byte sizes. They probably aren't worth the effort of
1341 // handling correctly.
1342 unsigned TySize = DL.getTypeSizeInBits(Ty);
1343 if ((TySize % 8) != 0)
1344 continue;
1345
1346 // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
1347 // functions are currently using an integer type for the vectorized
1348 // load/store, and does not support casting between the integer type and a
1349 // vector of pointers (e.g. i64 to <2 x i16*>)
1350 if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
1351 continue;
1352
// NOTE(review): a line declaring the pointer operand (original line 1353)
// is missing from this rendering; `Ptr` below is presumably
// getLoadStorePointerOperand(&I) -- confirm against the upstream file.
1354 unsigned AS = Ptr->getType()->getPointerAddressSpace();
1355 unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1356
1357 unsigned VF = VecRegSize / TySize;
1358 VectorType *VecTy = dyn_cast<VectorType>(Ty);
1359
1360 // Only handle power-of-two sized elements.
1361 if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
1362 (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
1363 continue;
1364
1365 // No point in looking at these if they're too big to vectorize.
1366 if (TySize > VecRegSize / 2 ||
1367 (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
1368 continue;
1369
// Insert into the class map. NOTE(review): the first line of the map key
// (original line 1370) is missing from this rendering -- confirm against
// the upstream file.
1371 DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
1372 /*IsLoad=*/LI != nullptr}]
1373 .push_back(&I);
1374 }
1375
1376 return Ret;
1377}
1378
// Groups the given (BB-ordered, same-address-space) loads or stores into
// chains of accesses at provably-constant offsets from each other. Returns
// only chains with at least two members; singletons cannot be vectorized.
1379std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
1380 if (Instrs.empty())
1381 return {};
1382
1383 unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
// Offsets are computed at the address space's index width.
1384 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
1385
1386#ifndef NDEBUG
1387 // Check that Instrs is in BB order and all have the same addr space.
1388 for (size_t I = 1; I < Instrs.size(); ++I) {
1389 assert(Instrs[I - 1]->comesBefore(Instrs[I]));
1390 assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
1391 }
1392#endif
1393
1394 // Machinery to build an MRU-hashtable of Chains.
1395 //
1396 // (Ideally this could be done with MapVector, but as currently implemented,
1397 // moving an element to the front of a MapVector is O(n).)
// An intrusive-list node pairing a chain's leader instruction with the chain
// itself; hashed by the leader instruction.
1398 struct InstrListElem : ilist_node<InstrListElem>,
1399 std::pair<Instruction *, Chain> {
1400 explicit InstrListElem(Instruction *I)
1401 : std::pair<Instruction *, Chain>(I, {}) {}
1402 };
1403 struct InstrListElemDenseMapInfo {
1404 using PtrInfo = DenseMapInfo<InstrListElem *>;
1405 using IInfo = DenseMapInfo<Instruction *>;
1406 static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
1407 static InstrListElem *getTombstoneKey() {
1408 return PtrInfo::getTombstoneKey();
1409 }
1410 static unsigned getHashValue(const InstrListElem *E) {
1411 return IInfo::getHashValue(E->first);
1412 }
1413 static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
1414 if (A == getEmptyKey() || B == getEmptyKey())
1415 return A == getEmptyKey() && B == getEmptyKey();
1416 if (A == getTombstoneKey() || B == getTombstoneKey())
1417 return A == getTombstoneKey() && B == getTombstoneKey();
1418 return IInfo::isEqual(A->first, B->first);
1419 }
1420 };
// NOTE(review): the declarations of the local containers used below
// (`Allocator`, `MRU`, `Chains`; original lines 1421-1423) are missing from
// this rendering -- confirm against the upstream file.
1424
1425 // Compare each instruction in `instrs` to leader of the N most recently-used
1426 // chains. This limits the O(n^2) behavior of this pass while also allowing
1427 // us to build arbitrarily long chains.
1428 for (Instruction *I : Instrs) {
1429 constexpr int MaxChainsToTry = 64;
1430
1431 bool MatchFound = false;
1432 auto ChainIter = MRU.begin();
1433 for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
1434 ++J, ++ChainIter) {
// NOTE(review): one argument line of this call (original line 1437,
// presumably I's pointer operand) is missing from this rendering.
1435 std::optional<APInt> Offset = getConstantOffset(
1436 getLoadStorePointerOperand(ChainIter->first),
1438 /*ContextInst=*/
1439 (ChainIter->first->comesBefore(I) ? I : ChainIter->first));
1440 if (Offset.has_value()) {
1441 // `Offset` might not have the expected number of bits, if e.g. AS has a
1442 // different number of bits than opaque pointers.
1443 ChainIter->second.push_back(ChainElem{I, Offset.value()});
1444 // Move ChainIter to the front of the MRU list.
1445 MRU.remove(*ChainIter);
1446 MRU.push_front(*ChainIter);
1447 MatchFound = true;
1448 break;
1449 }
1450 }
1451
// No existing chain accepts I: start a new chain with I as its leader at
// offset zero.
1452 if (!MatchFound) {
1453 APInt ZeroOffset(ASPtrBits, 0);
1454 InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
1455 E->second.push_back(ChainElem{I, ZeroOffset});
1456 MRU.push_front(*E);
1457 Chains.insert(E);
1458 }
1459 }
1460
1461 std::vector<Chain> Ret;
1462 Ret.reserve(Chains.size());
1463 // Iterate over MRU rather than Chains so the order is deterministic.
1464 for (auto &E : MRU)
1465 if (E.second.size() > 1)
1466 Ret.push_back(std::move(E.second));
1467 return Ret;
1468}
1469
// Top-level entry for computing the constant byte distance PtrB - PtrA (as
// seen by ContextInst), or std::nullopt if it cannot be proven constant.
// Tries, in order: stripping constant GEP offsets, ScalarEvolution, and the
// structural matchers (getConstantOffsetComplexAddrs / ...Selects).
1470std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
1471 Instruction *ContextInst,
1472 unsigned Depth) {
1473 LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
1474 << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst
1475 << ", Depth=" << Depth << "\n");
1476 // We'll ultimately return a value of this bit width, even if computations
1477 // happen in a different width.
1478 unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
1479 APInt OffsetA(OrigBitWidth, 0);
1480 APInt OffsetB(OrigBitWidth, 0);
// Peel constant GEP offsets off both pointers, accumulating them into
// OffsetA/OffsetB, so the remaining base pointers can be compared directly.
1481 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1482 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
// After stripping, both bases must live at the same pointer width to be
// comparable.
1483 unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
1484 if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
1485 return std::nullopt;
1486
1487 // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
1488 // should properly handle a possible overflow and the value should fit into
1489 // the smallest data type used in the cast/gep chain.
1490 assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
1491 OffsetB.getSignificantBits() <= NewPtrBitWidth);
1492
1493 OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
1494 OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
// Same base: the distance is just the difference of the stripped offsets.
1495 if (PtrA == PtrB)
1496 return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);
1497
1498 // Try to compute B - A.
1499 const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
1500 if (DistScev != SE.getCouldNotCompute()) {
1501 LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
1502 ConstantRange DistRange = SE.getSignedRange(DistScev);
// Usable only if SCEV pins the distance to a single constant value.
1503 if (DistRange.isSingleElement()) {
1504 // Handle index width (the width of Dist) != pointer width (the width of
1505 // the Offset*s at this point).
1506 APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
1507 return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
1508 }
1509 }
// Last resort: structural matching of GEP/select address computations.
1510 std::optional<APInt> Diff =
1511 getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth);
1512 if (Diff.has_value())
1513 return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
1514 .sextOrTrunc(OrigBitWidth);
1515 return std::nullopt;
1516}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
AMDGPU Mark last scratch load
This file implements a class to represent arbitrary precision integral constant values and operations...
Expand Atomic instructions
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
static const unsigned MaxDepth
static bool checkNoWrapFlags(Instruction *I, bool Signed)
static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA, unsigned MatchingOpIdxA, Instruction *AddOpB, unsigned MatchingOpIdxB, bool Signed)
#define DEBUG_TYPE
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
Basic Register Allocator
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
Provides some synthesis utilities to produce sequences of values.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use, const MachineInstr *Insert, const WebAssemblyFunctionInfo &MFI, const MachineRegisterInfo &MRI)
A manager for alias analyses.
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Check whether or not an instruction may read or write the optionally specified memory location.
Class for arbitrary precision integers.
Definition: APInt.h:77
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1386
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
APInt abs() const
Get the absolute value.
Definition: APInt.h:1752
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition: APInt.h:1180
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1447
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1090
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1145
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition: APInt.h:454
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1216
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1521
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:473
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:269
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
A function analysis which provides an AssumptionCache.
AssumptionCache run(Function &F, FunctionAnalysisManager &)
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:167
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:165
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:70
This class represents a range of values.
Definition: ConstantRange.h:47
const APInt * getSingleElement() const
If this set contains a single element, return it, otherwise return null.
bool isSingleElement() const
Return true if this set contains exactly one member.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:211
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:317
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.
Legacy wrapper pass to provide the GlobalsAAResult object.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:341
const BasicBlock * getParent() const
Definition: Instruction.h:152
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
An instruction for reading from memory.
Definition: Instructions.h:185
bool isSimple() const
Definition: Instructions.h:273
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1814
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:144
Legacy wrapper pass to provide the SCEVAAResult object.
This class represents an analyzed expression in the program.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getCouldNotCompute()
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:387
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Analysis pass providing the TargetTransformInfo.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
bool isLegalToVectorizeLoad(LoadInst *LI) const
bool isLegalToVectorizeStore(StoreInst *SI) const
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace=0, Align Alignment=Align(1), unsigned *Fast=nullptr) const
Determine if the target supports unaligned memory accesses.
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:262
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition: Value.h:736
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition: Value.cpp:693
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:683
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type size() const
Definition: DenseSet.h:81
TypeSize getSequentialElementStride(const DataLayout &DL) const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A simple intrusive list implementation.
Definition: simple_ilist.h:81
void push_front(reference Node)
Insert a node at the front; never copies.
Definition: simple_ilist.h:150
void remove(reference N)
Remove a node by reference; never deletes.
Definition: simple_ilist.h:189
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:456
auto min_element(R &&Range)
Definition: STLExtras.h:1978
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:540
Pass * createLoadStoreVectorizerPass()
Create a legacy pass manager instance of the LoadStoreVectorizer pass.
unsigned getLoadStoreAddressSpace(Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< po_iterator< T > > post_order(const T &G)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
Align getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to ensure that the alignment of V is at least PrefAlign bytes.
Definition: Local.cpp:1543
bool isModSet(const ModRefInfo MRI)
Definition: ModRef.h:48
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
Align getLoadStoreAlignment(Value *I)
A helper function that returns the alignment of load or store instruction.
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: ModRef.h:27
TargetTransformInfo TTI
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
auto max_element(R &&Range)
Definition: STLExtras.h:1986
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:293
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
gep_type_iterator gep_type_begin(const User *GEP)
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2051
void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &)
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:50
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117