LoadStoreVectorizer.cpp
1//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass merges loads/stores to/from sequential memory addresses into vector
10// loads/stores. Although there's nothing GPU-specific in here, this pass is
11// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12//
13// (For simplicity below we talk about loads only, but everything also applies
14// to stores.)
15//
16// This pass is intended to be run late in the pipeline, after other
17// vectorization opportunities have been exploited. So the assumption here is
18// that immediately following our new vector load we'll need to extract out the
19// individual elements of the load, so we can operate on them individually.
20//
21// On CPUs this transformation is usually not beneficial, because extracting the
22// elements of a vector register is expensive on most architectures. It's
23// usually better just to load each element individually into its own scalar
24// register.
25//
26// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27// "vector load" loads directly into a series of scalar registers. In effect,
28// extracting the elements of the vector is free. It's therefore always
29// beneficial to vectorize a sequence of loads on these architectures.
30//
31// Vectorizing (perhaps a better name might be "coalescing") loads can have
32// large performance impacts on GPU kernels, and opportunities for vectorizing
33// are common in GPU code. This pass tries very hard to find such
34// opportunities; its runtime is quadratic in the number of loads in a BB.
35//
36// Some CPU architectures, such as ARM, have instructions that load into
37// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38// could use this pass (with some modifications), but currently it implements
39// its own pass to do something similar to what we do here.
40//
41// Overview of the algorithm and terminology in this pass:
42//
43// - Break up each basic block into pseudo-BBs, composed of instructions which
44// are guaranteed to transfer control to their successors.
45// - Within a single pseudo-BB, find all loads, and group them into
46// "equivalence classes" according to getUnderlyingObject() and loaded
47// element size. Do the same for stores.
48// - For each equivalence class, greedily build "chains". Each chain has a
49// leader instruction, and every other member of the chain has a known
50// constant offset from the first instr in the chain.
51// - Break up chains so that they contain only contiguous accesses of legal
52// size with no intervening may-alias instrs.
53// - Convert each chain to vector instructions.
54//
55// The O(n^2) behavior of this pass comes from initially building the chains.
56// In the worst case we have to compare each new instruction to all of those
57// that came before. To limit this, we only calculate the offset to the leaders
58// of the N most recently-used chains.
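//
// As an illustrative sketch (hypothetical IR, not taken from a real test),
// four contiguous i32 loads such as
//
//   %l0 = load i32, ptr %p
//   %p1 = getelementptr i32, ptr %p, i64 1
//   %l1 = load i32, ptr %p1
//   %p2 = getelementptr i32, ptr %p, i64 2
//   %l2 = load i32, ptr %p2
//   %p3 = getelementptr i32, ptr %p, i64 3
//   %l3 = load i32, ptr %p3
//
// become (subject to the alignment and TTI legality checks below) roughly
//
//   %vec = load <4 x i32>, ptr %p
//   %l0 = extractelement <4 x i32> %vec, i32 0
//   %l1 = extractelement <4 x i32> %vec, i32 1
//   %l2 = extractelement <4 x i32> %vec, i32 2
//   %l3 = extractelement <4 x i32> %vec, i32 3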
59
60#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
61#include "llvm/ADT/APInt.h"
62#include "llvm/ADT/ArrayRef.h"
63#include "llvm/ADT/DenseMap.h"
64#include "llvm/ADT/MapVector.h"
66#include "llvm/ADT/STLExtras.h"
67#include "llvm/ADT/Sequence.h"
70#include "llvm/ADT/Statistic.h"
79#include "llvm/IR/Attributes.h"
80#include "llvm/IR/BasicBlock.h"
82#include "llvm/IR/Constants.h"
83#include "llvm/IR/DataLayout.h"
85#include "llvm/IR/Dominators.h"
86#include "llvm/IR/Function.h"
88#include "llvm/IR/IRBuilder.h"
89#include "llvm/IR/InstrTypes.h"
90#include "llvm/IR/Instruction.h"
92#include "llvm/IR/LLVMContext.h"
93#include "llvm/IR/Module.h"
94#include "llvm/IR/Type.h"
95#include "llvm/IR/Value.h"
97#include "llvm/Pass.h"
100#include "llvm/Support/Debug.h"
103#include "llvm/Support/ModRef.h"
106#include <algorithm>
107#include <cassert>
108#include <cstdint>
109#include <cstdlib>
110#include <iterator>
111#include <numeric>
112#include <optional>
113#include <tuple>
114#include <type_traits>
115#include <utility>
116#include <vector>
117
118using namespace llvm;
119
120#define DEBUG_TYPE "load-store-vectorizer"
121
122STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
123STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
124
125namespace {
126
127// Equivalence class key, the initial tuple by which we group loads/stores.
128// Loads/stores with different EqClassKeys are never merged.
129//
130// (We could in theory remove element-size from this tuple. We'd just need
131// to fix up the vector packing/unpacking code.)
132using EqClassKey =
133 std::tuple<const Value * /* result of getUnderlyingObject() */,
134 unsigned /* AddrSpace */,
135 unsigned /* Load/Store element size bits */,
136 char /* IsLoad; char b/c bool can't be a DenseMap key */
137 >;
138[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
139 const EqClassKey &K) {
140 const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
141 return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
142 << " of element size " << ElementSize << " bits in addrspace "
143 << AddrSpace;
144}
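
// For example (illustrative): an i32 load and a <2 x i32> load of the same
// underlying object in the same address space share a key (both are loads of
// 32-bit elements), whereas an i8 load or an i32 store of that object falls
// into a different equivalence class.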
145
146// A Chain is a set of instructions such that:
147// - All instructions have the same equivalence class, so in particular all are
148// loads, or all are stores.
149// - We know the address accessed by the i'th chain elem relative to the
150// chain's leader instruction, which is the first instr of the chain in BB
151// order.
152//
153// Chains have two canonical orderings:
154// - BB order, sorted by Instr->comesBefore.
155// - Offset order, sorted by OffsetFromLeader.
156// This pass switches back and forth between these orders.
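//
// For example (hypothetical): if the leader is "store i32 %a, ptr %p" and a
// later instruction in the BB stores to %p - 4, then BB order is
// (%p, %p - 4) while offset order is (%p - 4, %p), i.e. offsets (-4, 0)
// relative to the leader.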
157struct ChainElem {
158 Instruction *Inst;
159 APInt OffsetFromLeader;
160 ChainElem(Instruction *Inst, APInt OffsetFromLeader)
161 : Inst(std::move(Inst)), OffsetFromLeader(std::move(OffsetFromLeader)) {}
162};
163using Chain = SmallVector<ChainElem, 1>;
164
165void sortChainInBBOrder(Chain &C) {
166 sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
167}
168
169void sortChainInOffsetOrder(Chain &C) {
170 sort(C, [](const auto &A, const auto &B) {
171 if (A.OffsetFromLeader != B.OffsetFromLeader)
172 return A.OffsetFromLeader.slt(B.OffsetFromLeader);
173 return A.Inst->comesBefore(B.Inst); // stable tiebreaker
174 });
175}
176
177[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
178 for (const auto &E : C) {
179 dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
180 }
181}
182
183using EquivalenceClassMap =
184 MapVector<EqClassKey, SmallVector<Instruction *, 8>>;
185
186// FIXME: Assuming stack alignment of 4 is always good enough
187constexpr unsigned StackAdjustedAlignment = 4;
188
189Instruction *propagateMetadata(Instruction *I, const Chain &C) {
190 SmallVector<Value *, 8> Values;
191 for (const ChainElem &E : C)
192 Values.emplace_back(E.Inst);
193 return propagateMetadata(I, Values);
194}
195
196bool isInvariantLoad(const Instruction *I) {
197 const LoadInst *LI = dyn_cast<LoadInst>(I);
198 return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
199}
200
201/// Reorders the instructions that I depends on (the instructions defining its
202/// operands), to ensure they dominate I.
203void reorder(Instruction *I) {
204 SmallPtrSet<Instruction *, 16> InstructionsToMove;
205 SmallVector<Instruction *, 16> Worklist;
206
207 Worklist.emplace_back(I);
208 while (!Worklist.empty()) {
209 Instruction *IW = Worklist.pop_back_val();
210 int NumOperands = IW->getNumOperands();
211 for (int Idx = 0; Idx < NumOperands; Idx++) {
212 Instruction *IM = dyn_cast<Instruction>(IW->getOperand(Idx));
213 if (!IM || IM->getOpcode() == Instruction::PHI)
214 continue;
215
216 // If IM is in another BB, no need to move it, because this pass only
217 // vectorizes instructions within one BB.
218 if (IM->getParent() != I->getParent())
219 continue;
220
221 assert(IM != I && "Unexpected cycle while re-ordering instructions");
222
223 if (!IM->comesBefore(I)) {
224 InstructionsToMove.insert(IM);
225 Worklist.emplace_back(IM);
226 }
227 }
228 }
229
230 // All instructions to move should follow I. Start from I, not from begin().
231 for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
232 Instruction *IM = &*(BBI++);
233 if (!InstructionsToMove.contains(IM))
234 continue;
235 IM->moveBefore(I);
236 }
237}
238
239class Vectorizer {
240 Function &F;
241 AliasAnalysis &AA;
242 AssumptionCache &AC;
243 DominatorTree &DT;
244 ScalarEvolution &SE;
245 TargetTransformInfo &TTI;
246 const DataLayout &DL;
247 IRBuilder<> Builder;
248
249 // We could erase instrs right after vectorizing them, but that can mess up
250 // our BB iterators, and also can make the equivalence class keys point to
251 // freed memory. This is fixable, but it's simpler just to wait until we're
252 // done with the BB and erase all at once.
253 SmallVector<Instruction *, 128> ToErase;
254
255public:
256 Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
257 DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
258 : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
259 DL(F.getDataLayout()), Builder(SE.getContext()) {}
260
261 bool run();
262
263private:
264 static const unsigned MaxDepth = 3;
265
266 /// Runs the vectorizer on a "pseudo basic block", which is a range of
267 /// instructions [Begin, End) within one BB all of which have
268 /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
269 bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
270
271 /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
272 /// in the same BB with the same value for getUnderlyingObject() etc.
273 bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
274 ArrayRef<Instruction *> EqClass);
275
276 /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
277 /// where all instructions access a known, constant offset from the first
278 /// instruction.
279 bool runOnChain(Chain &C);
280
281 /// Splits the chain into subchains of instructions which read/write a
282 /// contiguous block of memory. Discards any length-1 subchains (because
283 /// there's nothing to vectorize in there).
284 std::vector<Chain> splitChainByContiguity(Chain &C);
285
286 /// Splits the chain into subchains where it's safe to hoist loads up to the
287 /// beginning of the sub-chain and it's safe to sink loads up to the end of
288 /// the sub-chain. Discards any length-1 subchains.
289 std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
290
291 /// Splits the chain into subchains that make legal, aligned accesses.
292 /// Discards any length-1 subchains.
293 std::vector<Chain> splitChainByAlignment(Chain &C);
294
295 /// Converts the instrs in the chain into a single vectorized load or store.
296 /// Adds the old scalar loads/stores to ToErase.
297 bool vectorizeChain(Chain &C);
298
299 /// Tries to compute the offset in bytes PtrB - PtrA.
300 std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
301 Instruction *ContextInst,
302 unsigned Depth = 0);
303 std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
304 Instruction *ContextInst,
305 unsigned Depth);
306 std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
307 Instruction *ContextInst,
308 unsigned Depth);
309
310 /// Gets the element type of the vector that the chain will load or store.
311 /// This is nontrivial because the chain may contain elements of different
312 /// types; e.g. it's legal to have a chain that contains both i32 and float.
313 Type *getChainElemTy(const Chain &C);
314
315 /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
316 /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
317 /// instructions.
318 ///
319 /// The map ChainElemOffsets must contain all of the elements in
320 /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
321 /// address. It's ok if it contains additional entries.
322 template <bool IsLoadChain>
323 bool isSafeToMove(
324 Instruction *ChainElem, Instruction *ChainBegin,
325 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
326
327 /// Merges the equivalence classes if they have underlying objects that differ
328 /// by one level of indirection (i.e., one is a getelementptr and the other is
329 /// the base pointer in that getelementptr).
330 void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const;
331
332 /// Collects loads and stores grouped by "equivalence class", where:
333 /// - all elements in an eq class are a load or all are a store,
334 /// - they all load/store the same element size (it's OK to have e.g. i8 and
335 /// <4 x i8> in the same class, but not i32 and <4 x i8>), and
336 /// - they all have the same value for getUnderlyingObject().
337 EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
338 BasicBlock::iterator End);
339
340 /// Partitions Instrs into "chains" where every instruction has a known
341 /// constant offset from the first instr in the chain.
342 ///
343 /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
344 /// in the chain is the leader, and an instr touches distance 0 from itself.
345 std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
346};
347
348class LoadStoreVectorizerLegacyPass : public FunctionPass {
349public:
350 static char ID;
351
352 LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
353 initializeLoadStoreVectorizerLegacyPassPass(
354 *PassRegistry::getPassRegistry());
355 }
356
357 bool runOnFunction(Function &F) override;
358
359 StringRef getPassName() const override {
360 return "GPU Load and Store Vectorizer";
361 }
362
363 void getAnalysisUsage(AnalysisUsage &AU) const override {
364 AU.addRequired<AAResultsWrapperPass>();
365 AU.addRequired<AssumptionCacheTracker>();
366 AU.addRequired<ScalarEvolutionWrapperPass>();
367 AU.addRequired<DominatorTreeWrapperPass>();
368 AU.addRequired<TargetTransformInfoWrapperPass>();
369 AU.setPreservesCFG();
370 }
371};
372
373} // end anonymous namespace
374
375char LoadStoreVectorizerLegacyPass::ID = 0;
376
377INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
378 "Vectorize load and Store instructions", false, false)
379INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
380INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
381INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
382INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
383INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
384INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
385INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
386 "Vectorize load and store instructions", false, false)
387
388Pass *llvm::createLoadStoreVectorizerPass() {
389 return new LoadStoreVectorizerLegacyPass();
390}
391
392bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
393 // Don't vectorize when the attribute NoImplicitFloat is used.
394 if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
395 return false;
396
397 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
398 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
399 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
400 TargetTransformInfo &TTI =
401 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
402
403 AssumptionCache &AC =
404 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
405
406 return Vectorizer(F, AA, AC, DT, SE, TTI).run();
407}
408
409PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
410 FunctionAnalysisManager &AM) {
411 // Don't vectorize when the attribute NoImplicitFloat is used.
412 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
413 return PreservedAnalyses::all();
414
415 AliasAnalysis &AA = AM.getResult<AAManager>(F);
416 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
417 ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
418 TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
419 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
420
421 bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
422 PreservedAnalyses PA;
423 PA.preserveSet<CFGAnalyses>();
424 return Changed ? PA : PreservedAnalyses::all();
425}
426
427bool Vectorizer::run() {
428 bool Changed = false;
429 // Break up the BB if there are any instrs which aren't guaranteed to transfer
430 // execution to their successor.
431 //
432 // Consider, for example:
433 //
434 // def assert_arr_len(int n) { if (n < 2) exit(); }
435 //
436 // load arr[0]
437 // call assert_arr_len(arr.length)
438 // load arr[1]
439 //
440 // Even though assert_arr_len does not read or write any memory, we can't
441 // speculate the second load before the call. More info at
442 // https://github.com/llvm/llvm-project/issues/52950.
443 for (BasicBlock *BB : post_order(&F)) {
444 // BB must at least have a terminator.
445 assert(!BB->empty());
446
447 SmallVector<BasicBlock::iterator, 8> Barriers;
448 Barriers.emplace_back(BB->begin());
449 for (Instruction &I : *BB)
450 if (!isGuaranteedToTransferExecutionToSuccessor(&I))
451 Barriers.emplace_back(I.getIterator());
452 Barriers.emplace_back(BB->end());
453
454 for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
455 ++It)
456 Changed |= runOnPseudoBB(*It, *std::next(It));
457
458 for (Instruction *I : ToErase) {
459 auto *PtrOperand = getLoadStorePointerOperand(I);
460 if (I->use_empty())
461 I->eraseFromParent();
462 RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
463 }
464 ToErase.clear();
465 }
466
467 return Changed;
468}
469
470bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
471 BasicBlock::iterator End) {
472 LLVM_DEBUG({
473 dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
474 if (End != Begin->getParent()->end())
475 dbgs() << *End;
476 else
477 dbgs() << "<BB end>";
478 dbgs() << ")\n";
479 });
480
481 bool Changed = false;
482 for (const auto &[EqClassKey, EqClass] :
483 collectEquivalenceClasses(Begin, End))
484 Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
485
486 return Changed;
487}
488
489bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
490 ArrayRef<Instruction *> EqClass) {
491 bool Changed = false;
492
493 LLVM_DEBUG({
494 dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
495 << " keyed on " << EqClassKey << ":\n";
496 for (Instruction *I : EqClass)
497 dbgs() << " " << *I << "\n";
498 });
499
500 std::vector<Chain> Chains = gatherChains(EqClass);
501 LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
502 << " nontrivial chains.\n";);
503 for (Chain &C : Chains)
504 Changed |= runOnChain(C);
505 return Changed;
506}
507
508bool Vectorizer::runOnChain(Chain &C) {
509 LLVM_DEBUG({
510 dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
511 dumpChain(C);
512 });
513
514 // Split up the chain into increasingly smaller chains, until we can finally
515 // vectorize the chains.
516 //
517 // (Don't be scared by the depth of the loop nest here. These operations are
518 // all at worst O(n lg n) in the number of instructions, and splitting chains
519 // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
520 bool Changed = false;
521 for (auto &C : splitChainByMayAliasInstrs(C))
522 for (auto &C : splitChainByContiguity(C))
523 for (auto &C : splitChainByAlignment(C))
524 Changed |= vectorizeChain(C);
525 return Changed;
526}
527
528std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
529 if (C.empty())
530 return {};
531
532 sortChainInBBOrder(C);
533
534 LLVM_DEBUG({
535 dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
536 dumpChain(C);
537 });
538
539 // We know that elements in the chain with non-overlapping offsets can't
540 // alias, but AA may not be smart enough to figure this out. Use a
541 // hashtable.
542 DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
543 for (const auto &E : C)
544 ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
545
546 // Loads get hoisted up to the first load in the chain. Stores get sunk
547 // down to the last store in the chain. Our algorithm for loads is:
548 //
549 // - Take the first element of the chain. This is the start of a new chain.
550 // - Take the next element of `Chain` and check for may-alias instructions
551 // up to the start of NewChain. If no may-alias instrs, add it to
552 // NewChain. Otherwise, start a new NewChain.
553 //
554 // For stores it's the same except in the reverse direction.
555 //
556 // We expect IsLoad to be an std::bool_constant.
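//
// Illustrative sketch of a split (hypothetical IR), for a load chain:
//
//   %x = load i32, ptr %p            ; OffsetFromLeader = 0
//   store i32 0, ptr %q              ; may alias %p + 4
//   %y = load i32, ptr %p4           ; OffsetFromLeader = 4
//
// The store may clobber the bytes read by %y, so %y cannot be hoisted up to
// %x; the chain is split here, and each length-1 piece is then discarded.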
557 auto Impl = [&](auto IsLoad) {
558 // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
559 auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
560 if constexpr (IsLoad())
561 return std::make_pair(C.begin(), C.end());
562 else
563 return std::make_pair(C.rbegin(), C.rend());
564 }(IsLoad);
565 assert(ChainBegin != ChainEnd);
566
567 std::vector<Chain> Chains;
568 SmallVector<ChainElem, 1> NewChain;
569 NewChain.emplace_back(*ChainBegin);
570 for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
571 if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
572 ChainOffsets)) {
573 LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
574 << *ChainIt->Inst << " into " << *ChainBegin->Inst
575 << "\n");
576 NewChain.emplace_back(*ChainIt);
577 } else {
579 dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
580 << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
581 if (NewChain.size() > 1) {
582 LLVM_DEBUG({
583 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
584 dumpChain(NewChain);
585 });
586 Chains.emplace_back(std::move(NewChain));
587 }
588
589 // Start a new chain.
590 NewChain = SmallVector<ChainElem, 1>({*ChainIt});
591 }
592 }
593 if (NewChain.size() > 1) {
594 LLVM_DEBUG({
595 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
596 dumpChain(NewChain);
597 });
598 Chains.emplace_back(std::move(NewChain));
599 }
600 return Chains;
601 };
602
603 if (isa<LoadInst>(C[0].Inst))
604 return Impl(/*IsLoad=*/std::bool_constant<true>());
605
606 assert(isa<StoreInst>(C[0].Inst));
607 return Impl(/*IsLoad=*/std::bool_constant<false>());
608}
609
610std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
611 if (C.empty())
612 return {};
613
614 sortChainInOffsetOrder(C);
615
616 LLVM_DEBUG({
617 dbgs() << "LSV: splitChainByContiguity considering chain:\n";
618 dumpChain(C);
619 });
620
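// Illustrative example (hypothetical offsets, i32 elements): offsets
// 0, 4, 8, 16 produce the sub-chains {0, 4, 8} and {16}; the second one has
// length 1 and is dropped by the filter at the end of this function.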
621 std::vector<Chain> Ret;
622 Ret.push_back({C.front()});
623
624 for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
625 // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
626 auto &CurChain = Ret.back();
627 const ChainElem &Prev = CurChain.back();
628 unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
629 assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
630 "collectEquivalenceClass");
631 APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
632
633 // Add this instruction to the end of the current chain, or start a new one.
634 bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
635 LLVM_DEBUG(dbgs() << "LSV: Instructions are "
636 << (AreContiguous ? "" : "not ") << "contiguous: "
637 << *Prev.Inst << " (ends at offset " << PrevReadEnd
638 << ") -> " << *It->Inst << " (starts at offset "
639 << It->OffsetFromLeader << ")\n");
640 if (AreContiguous)
641 CurChain.push_back(*It);
642 else
643 Ret.push_back({*It});
644 }
645
646 // Filter out length-1 chains, these are uninteresting.
647 llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
648 return Ret;
649}
650
651Type *Vectorizer::getChainElemTy(const Chain &C) {
652 assert(!C.empty());
653 // The rules are:
654 // - If there are any pointer types in the chain, use an integer type.
655 // - Prefer an integer type if it appears in the chain.
656 // - Otherwise, use the first type in the chain.
657 //
658 // The rule about pointer types is a simplification when we merge e.g. a load
659 // of a ptr and a double. There's no direct conversion from a ptr to a
660 // double; it requires a ptrtoint followed by a bitcast.
661 //
662 // It's unclear to me if the other rules have any practical effect, but we do
663 // it to match this pass's previous behavior.
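//
// For example (illustrative): a chain of {float, i32, float} yields i32; a
// chain of {ptr, ptr} with 64-bit pointers yields i64; and a chain of
// {float, <2 x float>} yields float.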
664 if (any_of(C, [](const ChainElem &E) {
665 return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
666 })) {
667 return Type::getIntNTy(
668 F.getContext(),
669 DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
670 }
671
672 for (const ChainElem &E : C)
673 if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
674 return T;
675 return getLoadStoreType(C[0].Inst)->getScalarType();
676}
677
678std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
679 // We use a simple greedy algorithm.
680 // - Given a chain of length N, find all prefixes that
681 // (a) are not longer than the max register length, and
682 // (b) are a power of 2.
683 // - Starting from the longest prefix, try to create a vector of that length.
684 // - If one of them works, great. Repeat the algorithm on any remaining
685 // elements in the chain.
686 // - If none of them work, discard the first element and repeat on a chain
687 // of length N-1.
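//
// Illustrative example (hypothetical): with 16-byte vector registers and a
// chain of 6 contiguous i32 accesses, the longest candidate starting at
// element 0 covers 4 elements (16 bytes). If TTI accepts it, those 4
// elements become one vector access and we continue with the remaining 2;
// otherwise we retry starting from element 1.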
688 if (C.empty())
689 return {};
690
691 sortChainInOffsetOrder(C);
692
693 LLVM_DEBUG({
694 dbgs() << "LSV: splitChainByAlignment considering chain:\n";
695 dumpChain(C);
696 });
697
698 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
699 auto GetVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
700 unsigned ChainSizeBytes, VectorType *VecTy) {
701 return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
702 ChainSizeBytes, VecTy)
703 : TTI.getStoreVectorFactor(VF, LoadStoreSize,
704 ChainSizeBytes, VecTy);
705 };
706
707#ifndef NDEBUG
708 for (const auto &E : C) {
709 Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
710 assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
711 "Should have filtered out non-power-of-two elements in "
712 "collectEquivalenceClasses.");
713 }
714#endif
715
716 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
717 unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
718
719 std::vector<Chain> Ret;
720 for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
721 // Find candidate chains of size not greater than the largest vector reg.
722 // These chains are over the closed interval [CBegin, CEnd].
723 SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
724 CandidateChains;
725 for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
726 APInt Sz = C[CEnd].OffsetFromLeader +
727 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
728 C[CBegin].OffsetFromLeader;
729 if (Sz.sgt(VecRegBytes))
730 break;
731 CandidateChains.emplace_back(CEnd,
732 static_cast<unsigned>(Sz.getLimitedValue()));
733 }
734
735 // Consider the longest chain first.
736 for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
737 It != End; ++It) {
738 auto [CEnd, SizeBytes] = *It;
740 dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
741 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
742
743 Type *VecElemTy = getChainElemTy(C);
744 // Note, VecElemTy is a power of 2, but might be less than one byte. For
745 // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
746 // VecElemTy would be i4.
747 unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);
748
749 // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
750 assert((8 * SizeBytes) % VecElemBits == 0);
751 unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
752 FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
753 unsigned VF = 8 * VecRegBytes / VecElemBits;
754
755 // Check that TTI is happy with this vectorization factor.
756 unsigned TargetVF = GetVectorFactor(VF, VecElemBits,
757 VecElemBits * NumVecElems / 8, VecTy);
758 if (TargetVF != VF && TargetVF < NumVecElems) {
760 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
761 "because TargetVF="
762 << TargetVF << " != VF=" << VF
763 << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
764 continue;
765 }
766
767 // Is a load/store with this alignment allowed by TTI and at least as fast
768 // as an unvectorized load/store?
769 //
770 // TTI and F are passed as explicit captures to WAR an MSVC misparse (??).
771 auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
772 &F = F](Align Alignment) {
773 if (Alignment.value() % SizeBytes == 0)
774 return true;
775 unsigned VectorizedSpeed = 0;
776 bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
777 F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
778 if (!AllowsMisaligned) {
780 << "LSV: Access of " << SizeBytes << "B in addrspace "
781 << AS << " with alignment " << Alignment.value()
782 << " is misaligned, and therefore can't be vectorized.\n");
783 return false;
784 }
785
786 unsigned ElementwiseSpeed = 0;
787 (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
788 Alignment, &ElementwiseSpeed);
789 if (VectorizedSpeed < ElementwiseSpeed) {
791 << "LSV: Access of " << SizeBytes << "B in addrspace "
792 << AS << " with alignment " << Alignment.value()
793 << " has relative speed " << VectorizedSpeed
794 << ", which is lower than the elementwise speed of "
795 << ElementwiseSpeed
796 << ". Therefore this access won't be vectorized.\n");
797 return false;
798 }
799 return true;
800 };
801
802 // If we're loading/storing from an alloca, align it if possible.
803 //
804 // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
805 // tells us this is beneficial. This feels a bit odd, but it matches
806 // existing tests. This isn't *so* bad, because at most we align to 4
807 // bytes (current value of StackAdjustedAlignment).
808 //
809 // FIXME: We will upgrade the alignment of the alloca even if it turns out
810 // we can't vectorize for some other reason.
811 Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
812 bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
813 isa<AllocaInst>(PtrOperand->stripPointerCasts());
814 Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
815 Align PrefAlign = Align(StackAdjustedAlignment);
816 if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
817 IsAllowedAndFast(PrefAlign)) {
818 Align NewAlign = getOrEnforceKnownAlignment(
819 PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
820 if (NewAlign >= Alignment) {
822 << "LSV: splitByChain upgrading alloca alignment from "
823 << Alignment.value() << " to " << NewAlign.value()
824 << "\n");
825 Alignment = NewAlign;
826 }
827 }
828
829 if (!IsAllowedAndFast(Alignment)) {
831 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
832 "because its alignment is not AllowedAndFast: "
833 << Alignment.value() << "\n");
834 continue;
835 }
836
837 if ((IsLoadChain &&
838 !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
839 (!IsLoadChain &&
840 !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
842 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
843 "because !isLegalToVectorizeLoad/StoreChain.");
844 continue;
845 }
846
847 // Hooray, we can vectorize this chain!
848 Chain &NewChain = Ret.emplace_back();
849 for (unsigned I = CBegin; I <= CEnd; ++I)
850 NewChain.emplace_back(C[I]);
851 CBegin = CEnd; // Skip over the instructions we've added to the chain.
852 break;
853 }
854 }
855 return Ret;
856}
857
858bool Vectorizer::vectorizeChain(Chain &C) {
859 if (C.size() < 2)
860 return false;
861
862 sortChainInOffsetOrder(C);
863
864 LLVM_DEBUG({
865 dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
866 dumpChain(C);
867 });
868
869 Type *VecElemTy = getChainElemTy(C);
870 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
871 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
872 unsigned ChainBytes = std::accumulate(
873 C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
874 return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
875 });
876 assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
877 // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
878 // than 1 byte (e.g. VecTy == <32 x i1>).
879 Type *VecTy = FixedVectorType::get(
880 VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
881
882 Align Alignment = getLoadStoreAlignment(C[0].Inst);
883 // If this is a load/store of an alloca, we might have upgraded the alloca's
884 // alignment earlier. Get the new alignment.
885 if (AS == DL.getAllocaAddrSpace()) {
886 Alignment = std::max(
887 Alignment,
888 getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
889 MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
890 }
891
892 // All elements of the chain must have the same scalar-type size.
893#ifndef NDEBUG
894 for (const ChainElem &E : C)
895 assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
896 DL.getTypeStoreSize(VecElemTy));
897#endif
898
899 Instruction *VecInst;
900 if (IsLoadChain) {
901 // Loads get hoisted to the location of the first load in the chain. We may
902 // also need to hoist the (transitive) operands of the loads.
903 Builder.SetInsertPoint(
904 llvm::min_element(C, [](const auto &A, const auto &B) {
905 return A.Inst->comesBefore(B.Inst);
906 })->Inst);
907
908 // Chain is in offset order, so C[0] is the instr with the lowest offset,
909 // i.e. the root of the vector.
910 VecInst = Builder.CreateAlignedLoad(VecTy,
911 getLoadStorePointerOperand(C[0].Inst),
912 Alignment);
913
914 unsigned VecIdx = 0;
915 for (const ChainElem &E : C) {
916 Instruction *I = E.Inst;
917 Value *V;
918 Type *T = getLoadStoreType(I);
919 if (auto *VT = dyn_cast<FixedVectorType>(T)) {
920 auto Mask = llvm::to_vector<8>(
921 llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
922 V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
923 VecIdx += VT->getNumElements();
924 } else {
925 V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
926 I->getName());
927 ++VecIdx;
928 }
929 if (V->getType() != I->getType())
930 V = Builder.CreateBitOrPointerCast(V, I->getType());
931 I->replaceAllUsesWith(V);
932 }
933
934 // Finally, we need to reorder the instrs in the BB so that the (transitive)
935 // operands of VecInst appear before it. To see why, suppose we have
936 // vectorized the following code:
937 //
938 // ptr1 = gep a, 1
939 // load1 = load i32 ptr1
940 // ptr0 = gep a, 0
941 // load0 = load i32 ptr0
942 //
943 // We will put the vectorized load at the location of the earliest load in
944 // the BB, i.e. load1. We get:
945 //
946 // ptr1 = gep a, 1
947 // loadv = load <2 x i32> ptr0
948 // load0 = extractelement loadv, 0
949 // load1 = extractelement loadv, 1
950 // ptr0 = gep a, 0
951 //
952 // Notice that loadv uses ptr0, which is defined *after* it!
953 reorder(VecInst);
954 } else {
955 // Stores get sunk to the location of the last store in the chain.
956 Builder.SetInsertPoint(llvm::max_element(C, [](auto &A, auto &B) {
957 return A.Inst->comesBefore(B.Inst);
958 })->Inst);
959
960 // Build the vector to store.
961 Value *Vec = PoisonValue::get(VecTy);
962 unsigned VecIdx = 0;
963 auto InsertElem = [&](Value *V) {
964 if (V->getType() != VecElemTy)
965 V = Builder.CreateBitOrPointerCast(V, VecElemTy);
966 Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
967 };
968 for (const ChainElem &E : C) {
969 auto *I = cast<StoreInst>(E.Inst);
970 if (FixedVectorType *VT =
971 dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
972 for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
973 InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
974 Builder.getInt32(J)));
975 }
976 } else {
977 InsertElem(I->getValueOperand());
978 }
979 }
980
981 // Chain is in offset order, so C[0] is the instr with the lowest offset,
982 // i.e. the root of the vector.
983 VecInst = Builder.CreateAlignedStore(
984 Vec,
985 getLoadStorePointerOperand(C[0].Inst),
986 Alignment);
987 }
988
989 propagateMetadata(VecInst, C);
990
991 for (const ChainElem &E : C)
992 ToErase.emplace_back(E.Inst);
993
994 ++NumVectorInstructions;
995 NumScalarsVectorized += C.size();
996 return true;
997}
998
999template <bool IsLoadChain>
1000bool Vectorizer::isSafeToMove(
1001 Instruction *ChainElem, Instruction *ChainBegin,
1002 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
1003 LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
1004 << *ChainBegin << ")\n");
1005
1006 assert(isa<LoadInst>(ChainElem) == IsLoadChain);
1007 if (ChainElem == ChainBegin)
1008 return true;
1009
1010 // Invariant loads can always be reordered; by definition they are not
1011 // clobbered by stores.
1012 if (isInvariantLoad(ChainElem))
1013 return true;
1014
1015 auto BBIt = std::next([&] {
1016 if constexpr (IsLoadChain)
1017 return BasicBlock::reverse_iterator(ChainElem);
1018 else
1019 return BasicBlock::iterator(ChainElem);
1020 }());
1021 auto BBItEnd = std::next([&] {
1022 if constexpr (IsLoadChain)
1023 return BasicBlock::reverse_iterator(ChainBegin);
1024 else
1025 return BasicBlock::iterator(ChainBegin);
1026 }());
1027
1028 const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
1029 const unsigned ChainElemSize =
1030 DL.getTypeStoreSize(getLoadStoreType(ChainElem));
1031
1032 for (; BBIt != BBItEnd; ++BBIt) {
1033 Instruction *I = &*BBIt;
1034
1035 if (!I->mayReadOrWriteMemory())
1036 continue;
1037
1038 // Loads can be reordered with other loads.
1039 if (IsLoadChain && isa<LoadInst>(I))
1040 continue;
1041
1042 // Stores can be sunk below invariant loads.
1043 if (!IsLoadChain && isInvariantLoad(I))
1044 continue;
1045
1046 // If I is in the chain, we can tell whether it aliases ChainElem by checking
1047 // what offset ChainElem accesses. This may be better than AA is able to do.
1048 //
1049 // We should really only have duplicate offsets for stores (the duplicate
1050 // loads should be CSE'ed), but in case we have a duplicate load, we'll
1051 // split the chain so we don't have to handle this case specially.
1052 if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
1053 // I and ChainElem overlap if:
1054 // - I and ChainElem have the same offset, OR
1055 // - I's offset is less than ChainElem's, but I touches past the
1056 // beginning of ChainElem, OR
1057 // - ChainElem's offset is less than I's, but ChainElem touches past the
1058 // beginning of I.
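//
// For example (hypothetical offsets): I at offset 4 with store size 4
// overlaps a ChainElem at offset 6, because 4 <= 6 and 4 + 4 > 6.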
1059 const APInt &IOffset = OffsetIt->second;
1060 unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
1061 if (IOffset == ChainElemOffset ||
1062 (IOffset.sle(ChainElemOffset) &&
1063 (IOffset + IElemSize).sgt(ChainElemOffset)) ||
1064 (ChainElemOffset.sle(IOffset) &&
1065 (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
1066 LLVM_DEBUG({
1067 // Double check that AA also sees this alias. If not, we probably
1068 // have a bug.
1069 ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1070 assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
1071 dbgs() << "LSV: Found alias in chain: " << *I << "\n";
1072 });
1073 return false; // We found an aliasing instruction; bail.
1074 }
1075
1076 continue; // We're confident there's no alias.
1077 }
1078
1079 LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
1080 ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1081 if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
1082 LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
1083 << " Aliasing instruction:\n"
1084 << " " << *I << '\n'
1085 << " Aliased instruction and pointer:\n"
1086 << " " << *ChainElem << '\n'
1087 << " " << *getLoadStorePointerOperand(ChainElem)
1088 << '\n');
1089
1090 return false;
1091 }
1092 }
1093 return true;
1094}
1095
1096static bool checkNoWrapFlags(Instruction *I, bool Signed) {
1097 BinaryOperator *BinOpI = cast<BinaryOperator>(I);
1098 return (Signed && BinOpI->hasNoSignedWrap()) ||
1099 (!Signed && BinOpI->hasNoUnsignedWrap());
1100}
1101
1102static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
1103 unsigned MatchingOpIdxA, Instruction *AddOpB,
1104 unsigned MatchingOpIdxB, bool Signed) {
1105 LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
1106 << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
1107 << MatchingOpIdxA << ", AddOpB=" << *AddOpB
1108 << ", MatchingOpIdxB=" << MatchingOpIdxB
1109 << ", Signed=" << Signed << "\n");
1110 // If both OpA and OpB are adds with NSW/NUW and with one of the operands
1111 // being the same, we can guarantee that the transformation is safe if we can
1112 // prove that OpA won't overflow when IdxDiff is added to the other operand of OpA.
1113 // For example:
1114 // %tmp7 = add nsw i32 %tmp2, %v0
1115 // %tmp8 = sext i32 %tmp7 to i64
1116 // ...
1117 // %tmp11 = add nsw i32 %v0, 1
1118 // %tmp12 = add nsw i32 %tmp2, %tmp11
1119 // %tmp13 = sext i32 %tmp12 to i64
1120 //
1121 // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
1122 // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
1123 // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
1124 assert(AddOpA->getOpcode() == Instruction::Add &&
1125 AddOpB->getOpcode() == Instruction::Add &&
1126 checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
1127 if (AddOpA->getOperand(MatchingOpIdxA) ==
1128 AddOpB->getOperand(MatchingOpIdxB)) {
1129 Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
1130 Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
1131 Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
1132 Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
1133 // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
1134 if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
1135 checkNoWrapFlags(OtherInstrB, Signed) &&
1136 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1137 int64_t CstVal =
1138 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1139 if (OtherInstrB->getOperand(0) == OtherOperandA &&
1140 IdxDiff.getSExtValue() == CstVal)
1141 return true;
1142 }
1143 // Match `x +nsw/nuw (y +nsw/nuw -Idx)` and `x +nsw/nuw (y +nsw/nuw x)`.
1144 if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
1145 checkNoWrapFlags(OtherInstrA, Signed) &&
1146 isa<ConstantInt>(OtherInstrA->getOperand(1))) {
1147 int64_t CstVal =
1148 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1149 if (OtherInstrA->getOperand(0) == OtherOperandB &&
1150 IdxDiff.getSExtValue() == -CstVal)
1151 return true;
1152 }
1153 // Match `x +nsw/nuw (y +nsw/nuw c)` and
1154 // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
1155 if (OtherInstrA && OtherInstrB &&
1156 OtherInstrA->getOpcode() == Instruction::Add &&
1157 OtherInstrB->getOpcode() == Instruction::Add &&
1158 checkNoWrapFlags(OtherInstrA, Signed) &&
1159 checkNoWrapFlags(OtherInstrB, Signed) &&
1160 isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
1161 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1162 int64_t CstValA =
1163 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1164 int64_t CstValB =
1165 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1166 if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
1167 IdxDiff.getSExtValue() == (CstValB - CstValA))
1168 return true;
1169 }
1170 }
1171 return false;
1172}
1173
1174std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
1175 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1176 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
1177 << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
1178 << " Depth=" << Depth << "\n");
1179 auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
1180 auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
1181 if (!GEPA || !GEPB)
1182 return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);
1183
1184 // Look through GEPs after checking they're the same except for the last
1185 // index.
1186 if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
1187 GEPA->getPointerOperand() != GEPB->getPointerOperand())
1188 return std::nullopt;
1189 gep_type_iterator GTIA = gep_type_begin(GEPA);
1190 gep_type_iterator GTIB = gep_type_begin(GEPB);
1191 for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
1192 if (GTIA.getOperand() != GTIB.getOperand())
1193 return std::nullopt;
1194 ++GTIA;
1195 ++GTIB;
1196 }
1197
1198 Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
1199 Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
1200 if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
1201 OpA->getType() != OpB->getType())
1202 return std::nullopt;
1203
1204 uint64_t Stride = GTIA.getSequentialElementStride(DL);
1205
1206 // Only look through a ZExt/SExt.
1207 if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
1208 return std::nullopt;
1209
1210 bool Signed = isa<SExtInst>(OpA);
1211
1212 // At this point A could be a function parameter, i.e. not an instruction
1213 Value *ValA = OpA->getOperand(0);
1214 OpB = dyn_cast<Instruction>(OpB->getOperand(0));
1215 if (!OpB || ValA->getType() != OpB->getType())
1216 return std::nullopt;
1217
1218 const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
1219 const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
1220 const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
1221 if (IdxDiffSCEV == SE.getCouldNotCompute())
1222 return std::nullopt;
1223
1224 ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
1225 if (!IdxDiffRange.isSingleElement())
1226 return std::nullopt;
1227 APInt IdxDiff = *IdxDiffRange.getSingleElement();
1228
1229 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
1230 << "\n");
1231
1232 // Now we need to prove that adding IdxDiff to ValA won't overflow.
1233 bool Safe = false;
1234
1235 // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
1236 // ValA, we're okay.
1237 if (OpB->getOpcode() == Instruction::Add &&
1238 isa<ConstantInt>(OpB->getOperand(1)) &&
1239 IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
1240 checkNoWrapFlags(OpB, Signed))
1241 Safe = true;
1242
1243 // Second attempt: check if we have eligible add NSW/NUW instruction
1244 // sequences.
1245 OpA = dyn_cast<Instruction>(ValA);
1246 if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
1247 OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
1248 checkNoWrapFlags(OpB, Signed)) {
1249 // In the checks below a matching operand in OpA and OpB is an operand which
1250 // is the same in those two instructions. Below we account for possible
1251 // orders of the operands of these add instructions.
1252 for (unsigned MatchingOpIdxA : {0, 1})
1253 for (unsigned MatchingOpIdxB : {0, 1})
1254 if (!Safe)
1255 Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
1256 MatchingOpIdxB, Signed);
1257 }
1258
1259 unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
1260
1261 // Third attempt:
1262 //
1263 // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
1264 // order bit other than the sign bit are known to be zero in ValA, we can add
1265 // Diff to it while guaranteeing no overflow of any sort.
1266 //
1267 // If IdxDiff is negative, do the same, but swap ValA and ValB.
1268 if (!Safe) {
1269 // When computing known bits, use the GEPs as context instructions, since
1270 // they likely are in the same BB as the load/store.
1271 KnownBits Known(BitWidth);
1272 computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC,
1273 ContextInst, &DT);
1274 APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
1275 if (Signed)
1276 BitsAllowedToBeSet.clearBit(BitWidth - 1);
1277 if (BitsAllowedToBeSet.ult(IdxDiff.abs()))
1278 return std::nullopt;
1279 Safe = true;
1280 }
1281
1282 if (Safe)
1283 return IdxDiff * Stride;
1284 return std::nullopt;
1285}
1286
1287std::optional<APInt> Vectorizer::getConstantOffsetSelects(
1288 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1289 if (Depth++ == MaxDepth)
1290 return std::nullopt;
1291
1292 if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
1293 if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
1294 if (SelectA->getCondition() != SelectB->getCondition())
1295 return std::nullopt;
1296 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
1297 << ", PtrB=" << *PtrB << ", ContextInst="
1298 << *ContextInst << ", Depth=" << Depth << "\n");
1299 std::optional<APInt> TrueDiff = getConstantOffset(
1300 SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
1301 if (!TrueDiff)
1302 return std::nullopt;
1303 std::optional<APInt> FalseDiff =
1304 getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
1305 ContextInst, Depth);
1306 if (TrueDiff == FalseDiff)
1307 return TrueDiff;
1308 }
1309 }
1310 return std::nullopt;
1311}
1312
1313void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
1314 if (EQClasses.size() < 2) // There is nothing to merge.
1315 return;
1316
1317 // The reduced key has all elements of the EqClassKey except the underlying
1318 // object. Check that EqClassKey has 4 elements and define the reduced key.
1319 static_assert(std::tuple_size_v<EqClassKey> == 4,
1320 "EqClassKey has changed - EqClassReducedKey needs changes too");
1321 using EqClassReducedKey =
1322 std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */,
1323 std::tuple_element_t<2, EqClassKey> /* Element size */,
1324 std::tuple_element_t<3, EqClassKey> /* IsLoad; */>;
1325 using ECReducedKeyToUnderlyingObjectMap =
1326 MapVector<EqClassReducedKey,
1327 SmallPtrSet<const Value *, 4>>;
1328
1329 // Form a map from the reduced key (without the underlying object) to the
1330 // underlying objects: 1 reduced key to many underlying objects, to form
1331 // groups of potentially merge-able equivalence classes.
1332 ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap;
1333 bool FoundPotentiallyOptimizableEC = false;
1334 for (const auto &EC : EQClasses) {
1335 const auto &Key = EC.first;
1336 EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
1337 std::get<3>(Key)};
1338 RedKeyToUOMap[RedKey].insert(std::get<0>(Key));
1339 if (RedKeyToUOMap[RedKey].size() > 1)
1340 FoundPotentiallyOptimizableEC = true;
1341 }
1342 if (!FoundPotentiallyOptimizableEC)
1343 return;
1344
1345 LLVM_DEBUG({
1346 dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n";
1347 for (const auto &EC : EQClasses) {
1348 dbgs() << " Key: {" << EC.first << "}\n";
1349 for (const auto &Inst : EC.second)
1350 dbgs() << " Inst: " << *Inst << '\n';
1351 }
1352 });
1353 LLVM_DEBUG({
1354 dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
1355 for (const auto &RedKeyToUO : RedKeyToUOMap) {
1356 dbgs() << " Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", "
1357 << std::get<1>(RedKeyToUO.first) << ", "
1358 << static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> "
1359 << RedKeyToUO.second.size() << " underlying objects:\n";
1360 for (auto UObject : RedKeyToUO.second)
1361 dbgs() << " " << *UObject << '\n';
1362 }
1363 });
1364
1365 using UObjectToUObjectMap = DenseMap<const Value *, const Value *>;
1366
1367 // Compute the ultimate targets for a set of underlying objects.
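//
// For example (hypothetical): given underlying objects %p, %g1 (a gep on %p)
// and %g2 (a gep on %g1), the indirection map is {%g1 -> %p, %g2 -> %g1}, and
// the ultimate target of all three is %p.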
1368 auto GetUltimateTargets =
1369 [](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap {
1370 UObjectToUObjectMap IndirectionMap;
1371 for (const auto *UObject : UObjects) {
1372 const unsigned MaxLookupDepth = 1; // look for 1-level indirections only
1373 const auto *UltimateTarget = getUnderlyingObject(UObject, MaxLookupDepth);
1374 if (UltimateTarget != UObject)
1375 IndirectionMap[UObject] = UltimateTarget;
1376 }
1377 UObjectToUObjectMap UltimateTargetsMap;
1378 for (const auto *UObject : UObjects) {
1379 auto Target = UObject;
1380 auto It = IndirectionMap.find(Target);
1381 for (; It != IndirectionMap.end(); It = IndirectionMap.find(Target))
1382 Target = It->second;
1383 UltimateTargetsMap[UObject] = Target;
1384 }
1385 return UltimateTargetsMap;
1386 };
1387
1388 // For each item in RedKeyToUOMap, if it has more than one underlying object,
1389 // try to merge the equivalence classes.
1390 for (auto &[RedKey, UObjects] : RedKeyToUOMap) {
1391 if (UObjects.size() < 2)
1392 continue;
1393 auto UTMap = GetUltimateTargets(UObjects);
1394 for (const auto &[UObject, UltimateTarget] : UTMap) {
1395 if (UObject == UltimateTarget)
1396 continue;
1397
1398 EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
1399 std::get<2>(RedKey)};
1400 EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
1401 std::get<2>(RedKey)};
1402 // The entry for KeyFrom is guaranteed to exist, unlike KeyTo. Thus,
1403 // request the reference to the instructions vector for KeyTo first.
1404 const auto &VecTo = EQClasses[KeyTo];
1405 const auto &VecFrom = EQClasses[KeyFrom];
1406 SmallVector<Instruction *, 8> MergedVec;
1407 std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(),
1408 std::back_inserter(MergedVec),
1409 [](Instruction *A, Instruction *B) {
1410 return A && B && A->comesBefore(B);
1411 });
1412 EQClasses[KeyTo] = std::move(MergedVec);
1413 EQClasses.erase(KeyFrom);
1414 }
1415 }
1416 LLVM_DEBUG({
1417 dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n";
1418 for (const auto &EC : EQClasses) {
1419 dbgs() << " Key: {" << EC.first << "}\n";
1420 for (const auto &Inst : EC.second)
1421 dbgs() << " Inst: " << *Inst << '\n';
1422 }
1423 });
1424}
1425
1426EquivalenceClassMap
1427Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
1428 BasicBlock::iterator End) {
1429 EquivalenceClassMap Ret;
1430
1431 auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
1432 const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
1433 if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
1434 // The selects themselves are distinct instructions even if they share
1435 // the same condition and evaluate to consecutive pointers for the true and
1436 // false values of the condition. Therefore, using the selects themselves
1437 // for grouping instructions would put consecutive accesses into different
1438 // lists; they wouldn't even be checked for being consecutive, and wouldn't
1439 // be vectorized.
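//
// Illustrative sketch (hypothetical IR):
//   %sel0 = select i1 %c, ptr %a, ptr %b
//   %sel1 = select i1 %c, ptr %a4, ptr %b4   ; %a4/%b4 are %a/%b plus 4
//   %x = load i32, ptr %sel0
//   %y = load i32, ptr %sel1
// Keying both loads on %c places them in one equivalence class, so
// getConstantOffsetSelects can later prove they are 4 bytes apart.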
1440 return Sel->getCondition();
1441 }
1442 return ObjPtr;
1443 };
1444
1445 for (Instruction &I : make_range(Begin, End)) {
1446 auto *LI = dyn_cast<LoadInst>(&I);
1447 auto *SI = dyn_cast<StoreInst>(&I);
1448 if (!LI && !SI)
1449 continue;
1450
1451 if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
1452 continue;
1453
1454 if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
1455 (SI && !TTI.isLegalToVectorizeStore(SI)))
1456 continue;
1457
1458 Type *Ty = getLoadStoreType(&I);
1459 if (!VectorType::isValidElementType(Ty->getScalarType()))
1460 continue;
1461
1462 // Skip weird non-byte sizes. They probably aren't worth the effort of
1463 // handling correctly.
1464 unsigned TySize = DL.getTypeSizeInBits(Ty);
1465 if ((TySize % 8) != 0)
1466 continue;
1467
1468 // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
1469 // functions are currently using an integer type for the vectorized
1470 // load/store, and do not support casting between the integer type and a
1471 // vector of pointers (e.g. i64 to <2 x i16*>)
1472 if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
1473 continue;
1474
1475 Value *Ptr = getLoadStorePointerOperand(&I);
1476 unsigned AS = Ptr->getType()->getPointerAddressSpace();
1477 unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1478
1479 unsigned VF = VecRegSize / TySize;
1480 VectorType *VecTy = dyn_cast<VectorType>(Ty);
1481
1482 // Only handle power-of-two sized elements.
1483 if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
1484 (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
1485 continue;
1486
1487 // No point in looking at these if they're too big to vectorize.
1488 if (TySize > VecRegSize / 2 ||
1489 (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
1490 continue;
1491
1492 Ret[{GetUnderlyingObject(Ptr), AS,
1493 DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
1494 /*IsLoad=*/LI != nullptr}]
1495 .emplace_back(&I);
1496 }
1497
1498 mergeEquivalenceClasses(Ret);
1499 return Ret;
1500}
1501
1502std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
1503 if (Instrs.empty())
1504 return {};
1505
1506 unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
1507 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
1508
1509#ifndef NDEBUG
1510 // Check that Instrs is in BB order and all have the same addr space.
1511 for (size_t I = 1; I < Instrs.size(); ++I) {
1512 assert(Instrs[I - 1]->comesBefore(Instrs[I]));
1513 assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
1514 }
1515#endif
1516
1517 // Machinery to build an MRU-hashtable of Chains.
1518 //
1519 // (Ideally this could be done with MapVector, but as currently implemented,
1520 // moving an element to the front of a MapVector is O(n).)
1521 struct InstrListElem : ilist_node<InstrListElem>,
1522 std::pair<Instruction *, Chain> {
1523 explicit InstrListElem(Instruction *I)
1524 : std::pair<Instruction *, Chain>(I, {}) {}
1525 };
1526 struct InstrListElemDenseMapInfo {
1527 using PtrInfo = DenseMapInfo<InstrListElem *>;
1528 using IInfo = DenseMapInfo<Instruction *>;
1529 static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
1530 static InstrListElem *getTombstoneKey() {
1531 return PtrInfo::getTombstoneKey();
1532 }
1533 static unsigned getHashValue(const InstrListElem *E) {
1534 return IInfo::getHashValue(E->first);
1535 }
1536 static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
1537 if (A == getEmptyKey() || B == getEmptyKey())
1538 return A == getEmptyKey() && B == getEmptyKey();
1539 if (A == getTombstoneKey() || B == getTombstoneKey())
1540 return A == getTombstoneKey() && B == getTombstoneKey();
1541 return IInfo::isEqual(A->first, B->first);
1542 }
1543 };
1544 SpecificBumpPtrAllocator<InstrListElem> Allocator;
1545 simple_ilist<InstrListElem> MRU; // Most recently used chains.
1546 DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;
1547
1548 // Compare each instruction in `instrs` to leader of the N most recently-used
1549 // chains. This limits the O(n^2) behavior of this pass while also allowing
1550 // us to build arbitrarily long chains.
1551 for (Instruction *I : Instrs) {
1552 constexpr int MaxChainsToTry = 64;
1553
1554 bool MatchFound = false;
1555 auto ChainIter = MRU.begin();
1556 for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
1557 ++J, ++ChainIter) {
1558 if (std::optional<APInt> Offset = getConstantOffset(
1559 getLoadStorePointerOperand(ChainIter->first),
1560 getLoadStorePointerOperand(I),
1561 /*ContextInst=*/
1562 (ChainIter->first->comesBefore(I) ? I : ChainIter->first))) {
1563 // `Offset` might not have the expected number of bits, if e.g. AS has a
1564 // different number of bits than opaque pointers.
1565 ChainIter->second.emplace_back(I, Offset.value());
1566 // Move ChainIter to the front of the MRU list.
1567 MRU.remove(*ChainIter);
1568 MRU.push_front(*ChainIter);
1569 MatchFound = true;
1570 break;
1571 }
1572 }
1573
1574 if (!MatchFound) {
1575 APInt ZeroOffset(ASPtrBits, 0);
1576 InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
1577 E->second.emplace_back(I, ZeroOffset);
1578 MRU.push_front(*E);
1579 Chains.insert(E);
1580 }
1581 }
1582
1583 std::vector<Chain> Ret;
1584 Ret.reserve(Chains.size());
1585 // Iterate over MRU rather than Chains so the order is deterministic.
1586 for (auto &E : MRU)
1587 if (E.second.size() > 1)
1588 Ret.emplace_back(std::move(E.second));
1589 return Ret;
1590}
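The loop above is the heart of chain building: each instruction is compared only against the leaders of the MaxChainsToTry most recently used chains, a matched chain is moved to the front of the intrusive list, and an unmatched instruction starts a new singleton chain. The sketch below is a simplified, self-contained model of that MRU scheme, not the pass itself: Access, ToyChain, constantOffset, and gatherToyChains are hypothetical names, std::list stands in for simple_ilist, and a crude same-64-byte-region test stands in for the real getConstantOffset().

#include <cstdint>
#include <iostream>
#include <list>
#include <optional>
#include <utility>
#include <vector>

namespace mru_sketch {

// A toy "access": just a byte address.
struct Access {
  uint64_t Addr;
};

// A chain is a list of (access, constant offset from the chain's leader).
using ToyChain = std::vector<std::pair<Access, int64_t>>;

// Stand-in for getConstantOffset(): pretend two accesses are related iff they
// fall in the same 64-byte region, and return the byte distance if so.
std::optional<int64_t> constantOffset(const Access &Leader, const Access &A) {
  if (A.Addr / 64 != Leader.Addr / 64)
    return std::nullopt;
  return static_cast<int64_t>(A.Addr) - static_cast<int64_t>(Leader.Addr);
}

std::vector<ToyChain> gatherToyChains(const std::vector<Access> &Accesses,
                                      size_t MaxChainsToTry = 4) {
  // Each list element is (leader, chain). std::list makes the move-to-front
  // O(1), which is why the pass uses an intrusive list rather than MapVector.
  std::list<std::pair<Access, ToyChain>> MRU;

  for (const Access &A : Accesses) {
    bool Matched = false;
    auto It = MRU.begin();
    for (size_t J = 0; J < MaxChainsToTry && It != MRU.end(); ++J, ++It) {
      if (std::optional<int64_t> Off = constantOffset(It->first, A)) {
        It->second.emplace_back(A, *Off);
        MRU.splice(MRU.begin(), MRU, It); // move matched chain to the front
        Matched = true;
        break;
      }
    }
    if (!Matched) {
      // No nearby leader found: A becomes the leader of a new chain, at
      // offset 0 from itself.
      ToyChain NewChain;
      NewChain.emplace_back(A, 0);
      MRU.emplace_front(A, std::move(NewChain));
    }
  }

  std::vector<ToyChain> Ret;
  for (auto &Entry : MRU)
    if (Entry.second.size() > 1) // singleton chains are not worth vectorizing
      Ret.push_back(std::move(Entry.second));
  return Ret;
}

} // namespace mru_sketch

int main() {
  using namespace mru_sketch;
  std::vector<Access> Accesses = {{0}, {4}, {8}, {1024}, {12}, {1028}};
  for (const ToyChain &C : gatherToyChains(Accesses)) {
    std::cout << "chain:";
    for (const auto &[A, Off] : C)
      std::cout << " (addr=" << A.Addr << ", off=" << Off << ")";
    std::cout << "\n";
  }
}

The bounded inner loop is what keeps the quadratic worst case in check: an access that belongs to a chain evicted past the MaxChainsToTry window simply starts a new chain rather than forcing a scan of every chain seen so far.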
1591
1592std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
1593 Instruction *ContextInst,
1594 unsigned Depth) {
1595 LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
1596 << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst
1597 << ", Depth=" << Depth << "\n");
1598 // We'll ultimately return a value of this bit width, even if computations
1599 // happen in a different width.
1600 unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
1601 APInt OffsetA(OrigBitWidth, 0);
1602 APInt OffsetB(OrigBitWidth, 0);
1603 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1604 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1605 unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
1606 if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
1607 return std::nullopt;
1608
1609 // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
1610 // should properly handle a possible overflow and the value should fit into
1611 // the smallest data type used in the cast/gep chain.
1612 assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
1613 OffsetB.getSignificantBits() <= NewPtrBitWidth);
1614
1615 OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
1616 OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
1617 if (PtrA == PtrB)
1618 return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);
1619
1620 // Try to compute B - A.
1621 const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
1622 if (DistScev != SE.getCouldNotCompute()) {
1623 LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
1624 ConstantRange DistRange = SE.getSignedRange(DistScev);
1625 if (DistRange.isSingleElement()) {
1626 // Handle index width (the width of Dist) != pointer width (the width of
1627 // the Offset*s at this point).
1628 APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
1629 return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
1630 }
1631 }
1632 if (std::optional<APInt> Diff =
1633 getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth))
1634 return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
1635 .sextOrTrunc(OrigBitWidth);
1636 return std::nullopt;
1637}
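In the common case the arithmetic above reduces to: strip constant offsets from both pointers, and if the stripped bases are identical (or SCEV knows their distance exactly), the answer is OffsetB - OffsetA (plus that distance). The toy below just restates that formula with two worked examples; constantOffset here is a hypothetical stand-in that uses int64_t, so the APInt sign-extend/truncate width handling is elided.

#include <cstdint>
#include <iostream>
#include <optional>

namespace offset_sketch {

// OffsetA/OffsetB are the constant byte offsets accumulated while stripping
// each pointer down to its base. BaseDistance stands in for the SCEV-computed
// distance between the two stripped bases, or std::nullopt if it is unknown.
std::optional<int64_t> constantOffset(int64_t OffsetA, int64_t OffsetB,
                                      bool SameBase,
                                      std::optional<int64_t> BaseDistance) {
  if (SameBase)
    return OffsetB - OffsetA;
  if (BaseDistance)
    return OffsetB - OffsetA + *BaseDistance;
  return std::nullopt; // bases unrelated as far as this sketch can tell
}

} // namespace offset_sketch

int main() {
  using namespace offset_sketch;
  // PtrA = p + 8 and PtrB = p + 24 strip to the same base, so the distance is
  // 24 - 8 = 16 bytes.
  if (auto D = constantOffset(/*OffsetA=*/8, /*OffsetB=*/24,
                              /*SameBase=*/true, std::nullopt))
    std::cout << "PtrB - PtrA = " << *D << " bytes\n";
  // PtrA = p + 8, PtrB = q + 4, and the base distance q - p is known to be 32:
  // the distance is 4 - 8 + 32 = 28 bytes.
  if (auto D = constantOffset(8, 4, /*SameBase=*/false, 32))
    std::cout << "PtrB - PtrA = " << *D << " bytes\n";
}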