// LoadStoreVectorizer.cpp — LLVM 23.0.0git
// (Recovered from the generated documentation view of this file.)
//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores. Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited. So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures. It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
// "vector load" loads directly into a series of scalar registers. In effect,
// extracting the elements of the vector is free. It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code. This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
//
// Overview of the algorithm and terminology in this pass:
//
// - Break up each basic block into pseudo-BBs, composed of instructions which
//   are guaranteed to transfer control to their successors.
// - Within a single pseudo-BB, find all loads, and group them into
//   "equivalence classes" according to getUnderlyingObject() and loaded
//   element size. Do the same for stores.
// - For each equivalence class, greedily build "chains". Each chain has a
//   leader instruction, and every other member of the chain has a known
//   constant offset from the first instr in the chain.
// - Break up chains so that they contain only contiguous accesses of legal
//   size with no intervening may-alias instrs.
// - Convert each chain to vector instructions.
//
// The O(n^2) behavior of this pass comes from initially building the chains.
// In the worst case we have to compare each new instruction to all of those
// that came before. To limit this, we only calculate the offset to the leaders
// of the N most recently-used chains.
// NOTE(review): gaps in the extracted line numbering indicate some #include
// lines were dropped by the documentation export; compare against upstream
// before building.
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ModRef.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <numeric>
#include <optional>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"

STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
124
125namespace {
126
127// Equivalence class key, the initial tuple by which we group loads/stores.
128// Loads/stores with different EqClassKeys are never merged.
129//
130// (We could in theory remove element-size from the this tuple. We'd just need
131// to fix up the vector packing/unpacking code.)
132using EqClassKey =
133 std::tuple<const Value * /* result of getUnderlyingObject() */,
134 unsigned /* AddrSpace */,
135 unsigned /* Load/Store element size bits */,
136 char /* IsLoad; char b/c bool can't be a DenseMap key */
137 >;
139 const EqClassKey &K) {
140 const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
141 return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
142 << " of element size " << ElementSize << " bits in addrspace "
143 << AddrSpace;
144}

// A Chain is a set of instructions such that:
// - All instructions have the same equivalence class, so in particular all are
//   loads, or all are stores.
// - We know the address accessed by the i'th chain elem relative to the
//   chain's leader instruction, which is the first instr of the chain in BB
//   order.
//
// Chains have two canonical orderings:
// - BB order, sorted by Instr->comesBefore.
// - Offset order, sorted by OffsetFromLeader.
// This pass switches back and forth between these orders.
157struct ChainElem {
158 Instruction *Inst;
159 APInt OffsetFromLeader;
160 ChainElem(Instruction *Inst, APInt OffsetFromLeader)
161 : Inst(std::move(Inst)), OffsetFromLeader(std::move(OffsetFromLeader)) {}
162};
163using Chain = SmallVector<ChainElem, 1>;
164
165void sortChainInBBOrder(Chain &C) {
166 sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
167}
168
169void sortChainInOffsetOrder(Chain &C) {
170 sort(C, [](const auto &A, const auto &B) {
171 if (A.OffsetFromLeader != B.OffsetFromLeader)
172 return A.OffsetFromLeader.slt(B.OffsetFromLeader);
173 return A.Inst->comesBefore(B.Inst); // stable tiebreaker
174 });
175}
176
177[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
178 for (const auto &E : C) {
179 dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
180 }
181}
182
183using EquivalenceClassMap =
185
186// FIXME: Assuming stack alignment of 4 is always good enough
187constexpr unsigned StackAdjustedAlignment = 4;
188
191 for (const ChainElem &E : C)
192 Values.emplace_back(E.Inst);
193 return propagateMetadata(I, Values);
194}
195
196bool isInvariantLoad(const Instruction *I) {
197 const LoadInst *LI = dyn_cast<LoadInst>(I);
198 return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
199}
200
201/// Reorders the instructions that I depends on (the instructions defining its
202/// operands), to ensure they dominate I.
203void reorder(Instruction *I) {
204 SmallPtrSet<Instruction *, 16> InstructionsToMove;
206
207 Worklist.emplace_back(I);
208 while (!Worklist.empty()) {
209 Instruction *IW = Worklist.pop_back_val();
210 int NumOperands = IW->getNumOperands();
211 for (int Idx = 0; Idx < NumOperands; Idx++) {
213 if (!IM || IM->getOpcode() == Instruction::PHI)
214 continue;
215
216 // If IM is in another BB, no need to move it, because this pass only
217 // vectorizes instructions within one BB.
218 if (IM->getParent() != I->getParent())
219 continue;
220
221 assert(IM != I && "Unexpected cycle while re-ordering instructions");
222
223 if (!IM->comesBefore(I)) {
224 InstructionsToMove.insert(IM);
225 Worklist.emplace_back(IM);
226 }
227 }
228 }
229
230 // All instructions to move should follow I. Start from I, not from begin().
231 for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
232 Instruction *IM = &*(BBI++);
233 if (!InstructionsToMove.contains(IM))
234 continue;
235 IM->moveBefore(I->getIterator());
236 }
237}
238
239class Vectorizer {
240 Function &F;
241 AliasAnalysis &AA;
242 AssumptionCache &AC;
243 DominatorTree &DT;
244 ScalarEvolution &SE;
245 TargetTransformInfo &TTI;
246 const DataLayout &DL;
247 IRBuilder<> Builder;
248
249 /// We could erase instrs right after vectorizing them, but that can mess up
250 /// our BB iterators, and also can make the equivalence class keys point to
251 /// freed memory. This is fixable, but it's simpler just to wait until we're
252 /// done with the BB and erase all at once.
254
255 /// We insert load/store instructions and GEPs to fill gaps and extend chains
256 /// to enable vectorization. Keep track and delete them later.
257 DenseSet<Instruction *> ExtraElements;
258
259public:
260 Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
261 DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
262 : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
263 DL(F.getDataLayout()), Builder(SE.getContext()) {}
264
265 bool run();
266
267private:
268 static const unsigned MaxDepth = 3;
269
270 /// Runs the vectorizer on a "pseudo basic block", which is a range of
271 /// instructions [Begin, End) within one BB all of which have
272 /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
273 bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
274
275 /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
276 /// in the same BB with the same value for getUnderlyingObject() etc.
277 bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
279
280 /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
281 /// where all instructions access a known, constant offset from the first
282 /// instruction.
283 bool runOnChain(Chain &C);
284
285 /// Splits the chain into subchains of instructions which read/write a
286 /// contiguous block of memory. Discards any length-1 subchains (because
287 /// there's nothing to vectorize in there). Also attempts to fill gaps with
288 /// "extra" elements to artificially make chains contiguous in some cases.
289 std::vector<Chain> splitChainByContiguity(Chain &C);
290
291 /// Splits the chain into subchains where it's safe to hoist loads up to the
292 /// beginning of the sub-chain and it's safe to sink loads up to the end of
293 /// the sub-chain. Discards any length-1 subchains. Also attempts to extend
294 /// non-power-of-two chains by adding "extra" elements in some cases.
295 std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
296
297 /// Splits the chain into subchains that make legal, aligned accesses.
298 /// Discards any length-1 subchains.
299 std::vector<Chain> splitChainByAlignment(Chain &C);
300
301 /// Converts the instrs in the chain into a single vectorized load or store.
302 /// Adds the old scalar loads/stores to ToErase.
303 bool vectorizeChain(Chain &C);
304
305 /// Tries to compute the offset in bytes PtrB - PtrA.
306 std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
307 Instruction *ContextInst,
308 unsigned Depth = 0);
309 std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
310 Instruction *ContextInst,
311 unsigned Depth);
312 std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
313 Instruction *ContextInst,
314 unsigned Depth);
315
316 /// Gets the element type of the vector that the chain will load or store.
317 /// This is nontrivial because the chain may contain elements of different
318 /// types; e.g. it's legal to have a chain that contains both i32 and float.
319 Type *getChainElemTy(const Chain &C);
320
321 /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
322 /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
323 /// instructions.
324 ///
325 /// The map ChainElemOffsets must contain all of the elements in
326 /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
327 /// address. It's ok if it contains additional entries.
328 template <bool IsLoadChain>
329 bool isSafeToMove(
330 Instruction *ChainElem, Instruction *ChainBegin,
331 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
332 BatchAAResults &BatchAA);
333
334 /// Merges the equivalence classes if they have underlying objects that differ
335 /// by one level of indirection (i.e., one is a getelementptr and the other is
336 /// the base pointer in that getelementptr).
337 void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const;
338
339 /// Collects loads and stores grouped by "equivalence class", where:
340 /// - all elements in an eq class are a load or all are a store,
341 /// - they all load/store the same element size (it's OK to have e.g. i8 and
342 /// <4 x i8> in the same class, but not i32 and <4 x i8>), and
343 /// - they all have the same value for getUnderlyingObject().
344 EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
346
347 /// Partitions Instrs into "chains" where every instruction has a known
348 /// constant offset from the first instr in the chain.
349 ///
350 /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
351 /// in the chain is the leader, and an instr touches distance 0 from itself.
352 std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
353
354 /// Checks if a potential vector load/store with a given alignment is allowed
355 /// and fast. Aligned accesses are always allowed and fast, while misaligned
356 /// accesses depend on TTI checks to determine whether they can and should be
357 /// vectorized or kept as element-wise accesses.
358 bool accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS, Align Alignment,
359 unsigned VecElemBits) const;
360
361 /// Create a new GEP and a new Load/Store instruction such that the GEP
362 /// is pointing at PrevElem + Offset. In the case of stores, store poison.
363 /// Extra elements will either be combined into a masked load/store or
364 /// deleted before the end of the pass.
365 ChainElem createExtraElementAfter(const ChainElem &PrevElem, Type *Ty,
366 APInt Offset, StringRef Prefix,
367 Align Alignment = Align());
368
369 /// Create a mask that masks off the extra elements in the chain, to be used
370 /// for the creation of a masked load/store vector.
371 Value *createMaskForExtraElements(const ArrayRef<ChainElem> C,
372 FixedVectorType *VecTy);
373
374 /// Delete dead GEPs and extra Load/Store instructions created by
375 /// createExtraElementAfter
376 void deleteExtraElements();
377};
378
379class LoadStoreVectorizerLegacyPass : public FunctionPass {
380public:
381 static char ID;
382
383 LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {}
384
385 bool runOnFunction(Function &F) override;
386
387 StringRef getPassName() const override {
388 return "GPU Load and Store Vectorizer";
389 }
390
391 void getAnalysisUsage(AnalysisUsage &AU) const override {
392 AU.addRequired<AAResultsWrapperPass>();
393 AU.addRequired<AssumptionCacheTracker>();
394 AU.addRequired<ScalarEvolutionWrapperPass>();
395 AU.addRequired<DominatorTreeWrapperPass>();
396 AU.addRequired<TargetTransformInfoWrapperPass>();
397 AU.setPreservesCFG();
398 }
399};
400
401} // end anonymous namespace
402
403char LoadStoreVectorizerLegacyPass::ID = 0;
404
405INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
406 "Vectorize load and Store instructions", false, false)
413INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
414 "Vectorize load and store instructions", false, false)
415
417 return new LoadStoreVectorizerLegacyPass();
418}
419
420bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
421 // Don't vectorize when the attribute NoImplicitFloat is used.
422 if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
423 return false;
424
425 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
426 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
427 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
428 TargetTransformInfo &TTI =
429 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
430
431 AssumptionCache &AC =
432 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
433
434 return Vectorizer(F, AA, AC, DT, SE, TTI).run();
435}
436
439 // Don't vectorize when the attribute NoImplicitFloat is used.
440 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
441 return PreservedAnalyses::all();
442
448
449 bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
452 return Changed ? PA : PreservedAnalyses::all();
453}
454
455bool Vectorizer::run() {
456 bool Changed = false;
457 // Break up the BB if there are any instrs which aren't guaranteed to transfer
458 // execution to their successor.
459 //
460 // Consider, for example:
461 //
462 // def assert_arr_len(int n) { if (n < 2) exit(); }
463 //
464 // load arr[0]
465 // call assert_array_len(arr.length)
466 // load arr[1]
467 //
468 // Even though assert_arr_len does not read or write any memory, we can't
469 // speculate the second load before the call. More info at
470 // https://github.com/llvm/llvm-project/issues/52950.
471 for (BasicBlock *BB : post_order(&F)) {
472 // BB must at least have a terminator.
473 assert(!BB->empty());
474
476 Barriers.emplace_back(BB->begin());
477 for (Instruction &I : *BB)
479 Barriers.emplace_back(I.getIterator());
480 Barriers.emplace_back(BB->end());
481
482 for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
483 ++It)
484 Changed |= runOnPseudoBB(*It, *std::next(It));
485
486 for (Instruction *I : ToErase) {
487 // These will get deleted in deleteExtraElements.
488 // This is because ExtraElements will include both extra elements
489 // that *were* vectorized and extra elements that *were not*
490 // vectorized. ToErase will only include extra elements that *were*
491 // vectorized, so in order to avoid double deletion we skip them here and
492 // handle them in deleteExtraElements.
493 if (ExtraElements.contains(I))
494 continue;
495 auto *PtrOperand = getLoadStorePointerOperand(I);
496 if (I->use_empty())
497 I->eraseFromParent();
499 }
500 ToErase.clear();
501 deleteExtraElements();
502 }
503
504 return Changed;
505}
506
507bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
509 LLVM_DEBUG({
510 dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
511 if (End != Begin->getParent()->end())
512 dbgs() << *End;
513 else
514 dbgs() << "<BB end>";
515 dbgs() << ")\n";
516 });
517
518 bool Changed = false;
519 for (const auto &[EqClassKey, EqClass] :
520 collectEquivalenceClasses(Begin, End))
521 Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
522
523 return Changed;
524}
525
526bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
527 ArrayRef<Instruction *> EqClass) {
528 bool Changed = false;
529
530 LLVM_DEBUG({
531 dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
532 << " keyed on " << EqClassKey << ":\n";
533 for (Instruction *I : EqClass)
534 dbgs() << " " << *I << "\n";
535 });
536
537 std::vector<Chain> Chains = gatherChains(EqClass);
538 LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
539 << " nontrivial chains.\n";);
540 for (Chain &C : Chains)
541 Changed |= runOnChain(C);
542 return Changed;
543}
544
545bool Vectorizer::runOnChain(Chain &C) {
546 LLVM_DEBUG({
547 dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
548 dumpChain(C);
549 });
550
551 // Split up the chain into increasingly smaller chains, until we can finally
552 // vectorize the chains.
553 //
554 // (Don't be scared by the depth of the loop nest here. These operations are
555 // all at worst O(n lg n) in the number of instructions, and splitting chains
556 // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
557 bool Changed = false;
558 for (auto &C : splitChainByMayAliasInstrs(C))
559 for (auto &C : splitChainByContiguity(C))
560 for (auto &C : splitChainByAlignment(C))
561 Changed |= vectorizeChain(C);
562 return Changed;
563}
564
565std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
566 if (C.empty())
567 return {};
568
569 sortChainInBBOrder(C);
570
571 LLVM_DEBUG({
572 dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
573 dumpChain(C);
574 });
575
576 // We know that elements in the chain with nonverlapping offsets can't
577 // alias, but AA may not be smart enough to figure this out. Use a
578 // hashtable.
579 DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
580 for (const auto &E : C)
581 ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
582
583 // Across a single invocation of this function the IR is not changing, so
584 // using a batched Alias Analysis is safe and can reduce compile time.
585 BatchAAResults BatchAA(AA);
586
587 // Loads get hoisted up to the first load in the chain. Stores get sunk
588 // down to the last store in the chain. Our algorithm for loads is:
589 //
590 // - Take the first element of the chain. This is the start of a new chain.
591 // - Take the next element of `Chain` and check for may-alias instructions
592 // up to the start of NewChain. If no may-alias instrs, add it to
593 // NewChain. Otherwise, start a new NewChain.
594 //
595 // For stores it's the same except in the reverse direction.
596 //
597 // We expect IsLoad to be an std::bool_constant.
598 auto Impl = [&](auto IsLoad) {
599 // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
600 auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
601 if constexpr (IsLoad())
602 return std::make_pair(C.begin(), C.end());
603 else
604 return std::make_pair(C.rbegin(), C.rend());
605 }(IsLoad);
606 assert(ChainBegin != ChainEnd);
607
608 std::vector<Chain> Chains;
610 NewChain.emplace_back(*ChainBegin);
611 for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
612 if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
613 ChainOffsets, BatchAA)) {
614 LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
615 << *ChainIt->Inst << " into " << *ChainBegin->Inst
616 << "\n");
617 NewChain.emplace_back(*ChainIt);
618 } else {
620 dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
621 << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
622 if (NewChain.size() > 1) {
623 LLVM_DEBUG({
624 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
625 dumpChain(NewChain);
626 });
627 Chains.emplace_back(std::move(NewChain));
628 }
629
630 // Start a new chain.
631 NewChain = SmallVector<ChainElem, 1>({*ChainIt});
632 }
633 }
634 if (NewChain.size() > 1) {
635 LLVM_DEBUG({
636 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
637 dumpChain(NewChain);
638 });
639 Chains.emplace_back(std::move(NewChain));
640 }
641 return Chains;
642 };
643
644 if (isa<LoadInst>(C[0].Inst))
645 return Impl(/*IsLoad=*/std::bool_constant<true>());
646
647 assert(isa<StoreInst>(C[0].Inst));
648 return Impl(/*IsLoad=*/std::bool_constant<false>());
649}
650
651std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
652 if (C.empty())
653 return {};
654
655 sortChainInOffsetOrder(C);
656
657 LLVM_DEBUG({
658 dbgs() << "LSV: splitChainByContiguity considering chain:\n";
659 dumpChain(C);
660 });
661
662 // If the chain is not contiguous, we try to fill the gap with "extra"
663 // elements to artificially make it contiguous, to try to enable
664 // vectorization. We only fill gaps if there is potential to end up with a
665 // legal masked load/store given the target, address space, and element type.
666 // At this point, when querying the TTI, optimistically assume max alignment
667 // and max vector size, as splitChainByAlignment will ensure the final vector
668 // shape passes the legalization check.
669 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
671 unsigned MaxVecRegBits = TTI.getLoadStoreVecRegBitWidth(AS);
672 Align OptimisticAlign = Align(MaxVecRegBits / 8);
673 unsigned int MaxVectorNumElems =
674 MaxVecRegBits / DL.getTypeSizeInBits(ElementType);
675 // Note: This check decides whether to try to fill gaps based on the masked
676 // legality of the target's maximum vector size (getLoadStoreVecRegBitWidth).
677 // If a target *does not* support a masked load/store with this max vector
678 // size, but *does* support a masked load/store with a *smaller* vector size,
679 // that optimization will be missed. This does not occur in any of the targets
680 // that currently support this API.
681 FixedVectorType *OptimisticVectorType =
682 FixedVectorType::get(ElementType, MaxVectorNumElems);
683 bool TryFillGaps =
684 isa<LoadInst>(C[0].Inst)
685 ? TTI.isLegalMaskedLoad(OptimisticVectorType, OptimisticAlign, AS,
687 : TTI.isLegalMaskedStore(OptimisticVectorType, OptimisticAlign, AS,
689
690 // Cache the best aligned element in the chain for use when creating extra
691 // elements.
692 Align BestAlignedElemAlign = getLoadStoreAlignment(C[0].Inst);
693 APInt OffsetOfBestAlignedElemFromLeader = C[0].OffsetFromLeader;
694 for (const auto &E : C) {
695 Align ElementAlignment = getLoadStoreAlignment(E.Inst);
696 if (ElementAlignment > BestAlignedElemAlign) {
697 BestAlignedElemAlign = ElementAlignment;
698 OffsetOfBestAlignedElemFromLeader = E.OffsetFromLeader;
699 }
700 }
701
702 auto DeriveAlignFromBestAlignedElem = [&](APInt NewElemOffsetFromLeader) {
703 return commonAlignment(
704 BestAlignedElemAlign,
705 (NewElemOffsetFromLeader - OffsetOfBestAlignedElemFromLeader)
706 .abs()
707 .getLimitedValue());
708 };
709
710 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
711
712 std::vector<Chain> Ret;
713 Ret.push_back({C.front()});
714
715 unsigned ChainElemTyBits = DL.getTypeSizeInBits(getChainElemTy(C));
716 ChainElem &Prev = C[0];
717 for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
718 auto &CurChain = Ret.back();
719
720 APInt PrevSzBytes =
721 APInt(ASPtrBits, DL.getTypeStoreSize(getLoadStoreType(Prev.Inst)));
722 APInt PrevReadEnd = Prev.OffsetFromLeader + PrevSzBytes;
723 unsigned SzBytes = DL.getTypeStoreSize(getLoadStoreType(It->Inst));
724
725 // Add this instruction to the end of the current chain, or start a new one.
726 assert(
727 8 * SzBytes % ChainElemTyBits == 0 &&
728 "Every chain-element size must be a multiple of the element size after "
729 "vectorization.");
730 APInt ReadEnd = It->OffsetFromLeader + SzBytes;
731 // Allow redundancy: partial or full overlap counts as contiguous.
732 bool AreContiguous = false;
733 if (It->OffsetFromLeader.sle(PrevReadEnd)) {
734 // Check overlap is a multiple of the element size after vectorization.
735 uint64_t Overlap = (PrevReadEnd - It->OffsetFromLeader).getZExtValue();
736 if (8 * Overlap % ChainElemTyBits == 0)
737 AreContiguous = true;
738 }
739
740 LLVM_DEBUG(dbgs() << "LSV: Instruction is "
741 << (AreContiguous ? "contiguous" : "chain-breaker")
742 << *It->Inst << " (starts at offset "
743 << It->OffsetFromLeader << ")\n");
744
745 // If the chain is not contiguous, try to fill in gaps between Prev and
746 // Curr. For now, we aren't filling gaps between load/stores of different
747 // sizes. Additionally, as a conservative heuristic, we only fill gaps of
748 // 1-2 elements. Generating loads/stores with too many unused bytes has a
749 // side effect of increasing register pressure (on NVIDIA targets at least),
750 // which could cancel out the benefits of reducing number of load/stores.
751 bool GapFilled = false;
752 if (!AreContiguous && TryFillGaps && PrevSzBytes == SzBytes) {
753 APInt GapSzBytes = It->OffsetFromLeader - PrevReadEnd;
754 if (GapSzBytes == PrevSzBytes) {
755 // There is a single gap between Prev and Curr, create one extra element
756 ChainElem NewElem = createExtraElementAfter(
757 Prev, getLoadStoreType(Prev.Inst), PrevSzBytes, "GapFill",
758 DeriveAlignFromBestAlignedElem(PrevReadEnd));
759 CurChain.push_back(NewElem);
760 GapFilled = true;
761 }
762 // There are two gaps between Prev and Curr, only create two extra
763 // elements if Prev is the first element in a sequence of four.
764 // This has the highest chance of resulting in a beneficial vectorization.
765 if ((GapSzBytes == 2 * PrevSzBytes) && (CurChain.size() % 4 == 1)) {
766 ChainElem NewElem1 = createExtraElementAfter(
767 Prev, getLoadStoreType(Prev.Inst), PrevSzBytes, "GapFill",
768 DeriveAlignFromBestAlignedElem(PrevReadEnd));
769 ChainElem NewElem2 = createExtraElementAfter(
770 NewElem1, getLoadStoreType(Prev.Inst), PrevSzBytes, "GapFill",
771 DeriveAlignFromBestAlignedElem(PrevReadEnd + PrevSzBytes));
772 CurChain.push_back(NewElem1);
773 CurChain.push_back(NewElem2);
774 GapFilled = true;
775 }
776 }
777
778 if (AreContiguous || GapFilled)
779 CurChain.push_back(*It);
780 else
781 Ret.push_back({*It});
782 // In certain cases when handling redundant elements with partial overlaps,
783 // the previous element may still extend beyond the current element. Only
784 // update Prev if the current element is the new end of the chain.
785 if (ReadEnd.sge(PrevReadEnd))
786 Prev = *It;
787 }
788
789 // Filter out length-1 chains, these are uninteresting.
790 llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
791 return Ret;
792}
793
794Type *Vectorizer::getChainElemTy(const Chain &C) {
795 assert(!C.empty());
796 // The rules are:
797 // - If there are any pointer types in the chain, use an integer type.
798 // - Prefer an integer type if it appears in the chain.
799 // - Otherwise, use the first type in the chain.
800 //
801 // The rule about pointer types is a simplification when we merge e.g. a load
802 // of a ptr and a double. There's no direct conversion from a ptr to a
803 // double; it requires a ptrtoint followed by a bitcast.
804 //
805 // It's unclear to me if the other rules have any practical effect, but we do
806 // it to match this pass's previous behavior.
807 if (any_of(C, [](const ChainElem &E) {
808 return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
809 })) {
810 return Type::getIntNTy(
811 F.getContext(),
812 DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
813 }
814
815 for (const ChainElem &E : C)
816 if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
817 return T;
818 return getLoadStoreType(C[0].Inst)->getScalarType();
819}
820
821std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
822 // We use a simple greedy algorithm.
823 // - Given a chain of length N, find all prefixes that
824 // (a) are not longer than the max register length, and
825 // (b) are a power of 2.
826 // - Starting from the longest prefix, try to create a vector of that length.
827 // - If one of them works, great. Repeat the algorithm on any remaining
828 // elements in the chain.
829 // - If none of them work, discard the first element and repeat on a chain
830 // of length N-1.
831 if (C.empty())
832 return {};
833
834 sortChainInOffsetOrder(C);
835
836 LLVM_DEBUG({
837 dbgs() << "LSV: splitChainByAlignment considering chain:\n";
838 dumpChain(C);
839 });
840
841 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
842 auto GetVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
843 unsigned ChainSizeBytes, VectorType *VecTy) {
844 return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
845 ChainSizeBytes, VecTy)
846 : TTI.getStoreVectorFactor(VF, LoadStoreSize,
847 ChainSizeBytes, VecTy);
848 };
849
850#ifndef NDEBUG
851 for (const auto &E : C) {
852 Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
853 assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
854 "Should have filtered out non-power-of-two elements in "
855 "collectEquivalenceClasses.");
856 }
857#endif
858
859 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
860 unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
861
862 // For compile time reasons, we cache whether or not the superset
863 // of all candidate chains contains any extra loads/stores from earlier gap
864 // filling.
865 bool CandidateChainsMayContainExtraLoadsStores = any_of(
866 C, [this](const ChainElem &E) { return ExtraElements.contains(E.Inst); });
867
868 std::vector<Chain> Ret;
869 for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
870 // Find candidate chains of size not greater than the largest vector reg.
871 // These chains are over the closed interval [CBegin, CEnd].
872 SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
873 CandidateChains;
874 // Need to compute the size of every candidate chain from its beginning
875 // because of possible overlapping among chain elements.
876 unsigned Sz = DL.getTypeStoreSize(getLoadStoreType(C[CBegin].Inst));
877 APInt PrevReadEnd = C[CBegin].OffsetFromLeader + Sz;
878 for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
879 APInt ReadEnd = C[CEnd].OffsetFromLeader +
880 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst));
881 unsigned BytesAdded =
882 PrevReadEnd.sle(ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue() : 0;
883 Sz += BytesAdded;
884 if (Sz > VecRegBytes)
885 break;
886 CandidateChains.emplace_back(CEnd, Sz);
887 PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
888 }
889
890 // Consider the longest chain first.
891 for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
892 It != End; ++It) {
893 auto [CEnd, SizeBytes] = *It;
895 dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
896 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
897
898 Type *VecElemTy = getChainElemTy(C);
899 // Note, VecElemTy is a power of 2, but might be less than one byte. For
900 // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
901 // VecElemTy would be i4.
902 unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);
903
904 // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
905 assert((8 * SizeBytes) % VecElemBits == 0);
906 unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
907 FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
908 unsigned VF = 8 * VecRegBytes / VecElemBits;
909
910 // Check that TTI is happy with this vectorization factor.
911 unsigned TargetVF = GetVectorFactor(VF, VecElemBits,
912 VecElemBits * NumVecElems / 8, VecTy);
913 if (TargetVF != VF && TargetVF < NumVecElems) {
915 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
916 "because TargetVF="
917 << TargetVF << " != VF=" << VF
918 << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
919 continue;
920 }
921
922 // If we're loading/storing from an alloca, align it if possible.
923 //
924 // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
925 // tells us this is beneficial. This feels a bit odd, but it matches
926 // existing tests. This isn't *so* bad, because at most we align to 4
927 // bytes (current value of StackAdjustedAlignment).
928 //
929 // FIXME: We will upgrade the alignment of the alloca even if it turns out
930 // we can't vectorize for some other reason.
931 Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
932 bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
933 isa<AllocaInst>(PtrOperand->stripPointerCasts());
934 Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
935 Align PrefAlign = Align(StackAdjustedAlignment);
936 if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
937 accessIsAllowedAndFast(SizeBytes, AS, PrefAlign, VecElemBits)) {
939 PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
940 if (NewAlign >= Alignment) {
942 << "LSV: splitByChain upgrading alloca alignment from "
943 << Alignment.value() << " to " << NewAlign.value()
944 << "\n");
945 Alignment = NewAlign;
946 }
947 }
948
949 Chain ExtendingLoadsStores;
950 if (!accessIsAllowedAndFast(SizeBytes, AS, Alignment, VecElemBits)) {
951 // If we have a non-power-of-2 element count, attempt to extend the
952 // chain to the next power-of-2 if it makes the access allowed and
953 // fast.
954 bool AllowedAndFast = false;
955 if (NumVecElems < TargetVF && !isPowerOf2_32(NumVecElems) &&
956 VecElemBits >= 8) {
957 // TargetVF may be a lot higher than NumVecElems,
958 // so only extend to the next power of 2.
959 assert(VecElemBits % 8 == 0);
960 unsigned VecElemBytes = VecElemBits / 8;
961 unsigned NewNumVecElems = PowerOf2Ceil(NumVecElems);
962 unsigned NewSizeBytes = VecElemBytes * NewNumVecElems;
963
964 assert(isPowerOf2_32(TargetVF) &&
965 "TargetVF expected to be a power of 2");
966 assert(NewNumVecElems <= TargetVF &&
967 "Should not extend past TargetVF");
968
970 << "LSV: attempting to extend chain of " << NumVecElems
971 << " " << (IsLoadChain ? "loads" : "stores") << " to "
972 << NewNumVecElems << " elements\n");
973 bool IsLegalToExtend =
974 IsLoadChain ? TTI.isLegalMaskedLoad(
975 FixedVectorType::get(VecElemTy, NewNumVecElems),
976 Alignment, AS, TTI::MaskKind::ConstantMask)
978 FixedVectorType::get(VecElemTy, NewNumVecElems),
979 Alignment, AS, TTI::MaskKind::ConstantMask);
980 // Only artificially increase the chain if it would be AllowedAndFast
981 // and if the resulting masked load/store will be legal for the
982 // target.
983 if (IsLegalToExtend &&
984 accessIsAllowedAndFast(NewSizeBytes, AS, Alignment,
985 VecElemBits)) {
987 << "LSV: extending " << (IsLoadChain ? "load" : "store")
988 << " chain of " << NumVecElems << " "
989 << (IsLoadChain ? "loads" : "stores")
990 << " with total byte size of " << SizeBytes << " to "
991 << NewNumVecElems << " "
992 << (IsLoadChain ? "loads" : "stores")
993 << " with total byte size of " << NewSizeBytes
994 << ", TargetVF=" << TargetVF << " \n");
995
996 // Create (NewNumVecElems - NumVecElems) extra elements.
997 // We are basing each extra element on CBegin, which means the
998 // offsets should be based on SizeBytes, which represents the offset
999 // from CBegin to the current end of the chain.
1000 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
1001 for (unsigned I = 0; I < (NewNumVecElems - NumVecElems); I++) {
1002 ChainElem NewElem = createExtraElementAfter(
1003 C[CBegin], VecElemTy,
1004 APInt(ASPtrBits, SizeBytes + I * VecElemBytes), "Extend");
1005 ExtendingLoadsStores.push_back(NewElem);
1006 }
1007
1008 // Update the size and number of elements for upcoming checks.
1009 SizeBytes = NewSizeBytes;
1010 NumVecElems = NewNumVecElems;
1011 AllowedAndFast = true;
1012 }
1013 }
1014 if (!AllowedAndFast) {
1015 // We were not able to achieve legality by extending the chain.
1017 << "LSV: splitChainByAlignment discarding candidate chain "
1018 "because its alignment is not AllowedAndFast: "
1019 << Alignment.value() << "\n");
1020 continue;
1021 }
1022 }
1023
1024 if ((IsLoadChain &&
1025 !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
1026 (!IsLoadChain &&
1027 !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
1028 LLVM_DEBUG(
1029 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
1030 "because !isLegalToVectorizeLoad/StoreChain.");
1031 continue;
1032 }
1033
1034 if (CandidateChainsMayContainExtraLoadsStores) {
1035 // If the candidate chain contains extra loads/stores from an earlier
1036 // optimization, confirm legality now. This filter is essential because
1037 // when filling gaps in splitChainByContiguity, we queried the API to
1038 // check that (for a given element type and address space) there *may*
1039 // have been a legal masked load/store we could possibly create. Now, we
1040 // need to check if the actual chain we ended up with is legal to turn
1041 // into a masked load/store. This is relevant for NVPTX, for example,
1042 // where a masked store is only legal if we have ended up with a 256-bit
1043 // vector.
1044 bool CurrCandContainsExtraLoadsStores = llvm::any_of(
1045 ArrayRef<ChainElem>(C).slice(CBegin, CEnd - CBegin + 1),
1046 [this](const ChainElem &E) {
1047 return ExtraElements.contains(E.Inst);
1048 });
1049
1050 if (CurrCandContainsExtraLoadsStores &&
1051 (IsLoadChain ? !TTI.isLegalMaskedLoad(
1052 FixedVectorType::get(VecElemTy, NumVecElems),
1053 Alignment, AS, TTI::MaskKind::ConstantMask)
1055 FixedVectorType::get(VecElemTy, NumVecElems),
1056 Alignment, AS, TTI::MaskKind::ConstantMask))) {
1058 << "LSV: splitChainByAlignment discarding candidate chain "
1059 "because it contains extra loads/stores that we cannot "
1060 "legally vectorize into a masked load/store \n");
1061 continue;
1062 }
1063 }
1064
1065 // Hooray, we can vectorize this chain!
1066 Chain &NewChain = Ret.emplace_back();
1067 for (unsigned I = CBegin; I <= CEnd; ++I)
1068 NewChain.emplace_back(C[I]);
1069 for (ChainElem E : ExtendingLoadsStores)
1070 NewChain.emplace_back(E);
1071 CBegin = CEnd; // Skip over the instructions we've added to the chain.
1072 break;
1073 }
1074 }
1075 return Ret;
1076}
1077
1078bool Vectorizer::vectorizeChain(Chain &C) {
1079 if (C.size() < 2)
1080 return false;
1081
1082 bool ChainContainsExtraLoadsStores = llvm::any_of(
1083 C, [this](const ChainElem &E) { return ExtraElements.contains(E.Inst); });
1084
1085 // If we are left with a two-element chain, and one of the elements is an
1086 // extra element, we don't want to vectorize
1087 if (C.size() == 2 && ChainContainsExtraLoadsStores)
1088 return false;
1089
1090 sortChainInOffsetOrder(C);
1091
1092 LLVM_DEBUG({
1093 dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
1094 dumpChain(C);
1095 });
1096
1097 Type *VecElemTy = getChainElemTy(C);
1098 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
1099 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
1100 unsigned BytesAdded = DL.getTypeStoreSize(getLoadStoreType(&*C[0].Inst));
1101 APInt PrevReadEnd = C[0].OffsetFromLeader + BytesAdded;
1102 unsigned ChainBytes = BytesAdded;
1103 for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
1104 unsigned SzBytes = DL.getTypeStoreSize(getLoadStoreType(&*It->Inst));
1105 APInt ReadEnd = It->OffsetFromLeader + SzBytes;
1106 // Update ChainBytes considering possible overlap.
1107 BytesAdded =
1108 PrevReadEnd.sle(ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue() : 0;
1109 ChainBytes += BytesAdded;
1110 PrevReadEnd = APIntOps::smax(PrevReadEnd, ReadEnd);
1111 }
1112
1113 assert(8 * ChainBytes % DL.getTypeSizeInBits(VecElemTy) == 0);
1114 // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
1115 // than 1 byte (e.g. VecTy == <32 x i1>).
1116 unsigned NumElem = 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy);
1117 Type *VecTy = FixedVectorType::get(VecElemTy, NumElem);
1118
1119 Align Alignment = getLoadStoreAlignment(C[0].Inst);
1120 // If this is a load/store of an alloca, we might have upgraded the alloca's
1121 // alignment earlier. Get the new alignment.
1122 if (AS == DL.getAllocaAddrSpace()) {
1123 Alignment = std::max(
1124 Alignment,
1126 MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
1127 }
1128
1129 // All elements of the chain must have the same scalar-type size.
1130#ifndef NDEBUG
1131 for (const ChainElem &E : C)
1132 assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
1133 DL.getTypeStoreSize(VecElemTy));
1134#endif
1135
1136 Instruction *VecInst;
1137 if (IsLoadChain) {
1138 // Loads get hoisted to the location of the first load in the chain. We may
1139 // also need to hoist the (transitive) operands of the loads.
1140 Builder.SetInsertPoint(
1141 llvm::min_element(C, [](const auto &A, const auto &B) {
1142 return A.Inst->comesBefore(B.Inst);
1143 })->Inst);
1144
1145 // If the chain contains extra loads, we need to vectorize into a
1146 // masked load.
1147 if (ChainContainsExtraLoadsStores) {
1148 assert(TTI.isLegalMaskedLoad(VecTy, Alignment, AS,
1150 Value *Mask = createMaskForExtraElements(C, cast<FixedVectorType>(VecTy));
1151 VecInst = Builder.CreateMaskedLoad(
1152 VecTy, getLoadStorePointerOperand(C[0].Inst), Alignment, Mask);
1153 } else {
1154 // This can happen due to a chain of redundant loads.
1155 // In this case, just use the element-type, and avoid ExtractElement.
1156 if (NumElem == 1)
1157 VecTy = VecElemTy;
1158 // Chain is in offset order, so C[0] is the instr with the lowest offset,
1159 // i.e. the root of the vector.
1160 VecInst = Builder.CreateAlignedLoad(
1161 VecTy, getLoadStorePointerOperand(C[0].Inst), Alignment);
1162 }
1163
1164 for (const ChainElem &E : C) {
1165 Instruction *I = E.Inst;
1166 Value *V;
1168 unsigned EOffset =
1169 (E.OffsetFromLeader - C[0].OffsetFromLeader).getZExtValue();
1170 unsigned VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
1171 if (!VecTy->isVectorTy()) {
1172 V = VecInst;
1173 } else if (auto *VT = dyn_cast<FixedVectorType>(T)) {
1174 auto Mask = llvm::to_vector<8>(
1175 llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
1176 V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
1177 } else {
1178 V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
1179 I->getName());
1180 }
1181 if (V->getType() != I->getType())
1182 V = Builder.CreateBitOrPointerCast(V, I->getType());
1184 }
1185
1186 // Finally, we need to reorder the instrs in the BB so that the (transitive)
1187 // operands of VecInst appear before it. To see why, suppose we have
1188 // vectorized the following code:
1189 //
1190 // ptr1 = gep a, 1
1191 // load1 = load i32 ptr1
1192 // ptr0 = gep a, 0
1193 // load0 = load i32 ptr0
1194 //
1195 // We will put the vectorized load at the location of the earliest load in
1196 // the BB, i.e. load1. We get:
1197 //
1198 // ptr1 = gep a, 1
1199 // loadv = load <2 x i32> ptr0
1200 // load0 = extractelement loadv, 0
1201 // load1 = extractelement loadv, 1
1202 // ptr0 = gep a, 0
1203 //
1204 // Notice that loadv uses ptr0, which is defined *after* it!
1205 reorder(VecInst);
1206 } else {
1207 // Stores get sunk to the location of the last store in the chain.
1208 Builder.SetInsertPoint(llvm::max_element(C, [](auto &A, auto &B) {
1209 return A.Inst->comesBefore(B.Inst);
1210 })->Inst);
1211
1212 // Build the vector to store.
1213 Value *Vec = PoisonValue::get(VecTy);
1214 auto InsertElem = [&](Value *V, unsigned VecIdx) {
1215 if (V->getType() != VecElemTy)
1216 V = Builder.CreateBitOrPointerCast(V, VecElemTy);
1217 Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx));
1218 };
1219 for (const ChainElem &E : C) {
1220 auto *I = cast<StoreInst>(E.Inst);
1221 unsigned EOffset =
1222 (E.OffsetFromLeader - C[0].OffsetFromLeader).getZExtValue();
1223 unsigned VecIdx = 8 * EOffset / DL.getTypeSizeInBits(VecElemTy);
1224 if (FixedVectorType *VT =
1226 for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
1227 InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
1228 Builder.getInt32(J)),
1229 VecIdx++);
1230 }
1231 } else {
1232 InsertElem(I->getValueOperand(), VecIdx);
1233 }
1234 }
1235
1236 // If the chain originates from extra stores, we need to vectorize into a
1237 // masked store.
1238 if (ChainContainsExtraLoadsStores) {
1239 assert(TTI.isLegalMaskedStore(Vec->getType(), Alignment, AS,
1241 Value *Mask =
1242 createMaskForExtraElements(C, cast<FixedVectorType>(Vec->getType()));
1243 VecInst = Builder.CreateMaskedStore(
1244 Vec, getLoadStorePointerOperand(C[0].Inst), Alignment, Mask);
1245 } else {
1246 // Chain is in offset order, so C[0] is the instr with the lowest offset,
1247 // i.e. the root of the vector.
1248 VecInst = Builder.CreateAlignedStore(
1249 Vec, getLoadStorePointerOperand(C[0].Inst), Alignment);
1250 }
1251 }
1252
1253 propagateMetadata(VecInst, C);
1254
1255 for (const ChainElem &E : C)
1256 ToErase.emplace_back(E.Inst);
1257
1258 ++NumVectorInstructions;
1259 NumScalarsVectorized += C.size();
1260 return true;
1261}
1262
// Returns true if ChainElem can be moved up (for loads) or down (for stores)
// to ChainBegin's position without changing program behavior: it scans every
// instruction strictly between the two and rejects the move if any of them
// may alias ChainElem. For instructions that are themselves members of the
// chain, the known constant offsets are used instead of (weaker) AA queries.
template <bool IsLoadChain>
bool Vectorizer::isSafeToMove(
    Instruction *ChainElem, Instruction *ChainBegin,
    const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
    BatchAAResults &BatchAA) {
  LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
                    << *ChainBegin << ")\n");

  assert(isa<LoadInst>(ChainElem) == IsLoadChain);
  if (ChainElem == ChainBegin)
    return true;

  // Invariant loads can always be reordered; by definition they are not
  // clobbered by stores.
  if (isInvariantLoad(ChainElem))
    return true;

  // Loads are hoisted upwards, so walk backwards from ChainElem to
  // ChainBegin; stores are sunk downwards, so walk forwards. The immediately-
  // invoked constexpr lambdas pick the iterator direction at compile time;
  // std::next skips ChainElem itself (and makes the end bound inclusive of
  // ChainBegin's side).
  auto BBIt = std::next([&] {
    if constexpr (IsLoadChain)
      return BasicBlock::reverse_iterator(ChainElem);
    else
      return BasicBlock::iterator(ChainElem);
  }());
  auto BBItEnd = std::next([&] {
    if constexpr (IsLoadChain)
      return BasicBlock::reverse_iterator(ChainBegin);
    else
      return BasicBlock::iterator(ChainBegin);
  }());

  const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
  const unsigned ChainElemSize =
      DL.getTypeStoreSize(getLoadStoreType(ChainElem));

  for (; BBIt != BBItEnd; ++BBIt) {
    Instruction *I = &*BBIt;

    if (!I->mayReadOrWriteMemory())
      continue;

    // Loads can be reordered with other loads.
    if (IsLoadChain && isa<LoadInst>(I))
      continue;

    // Stores can be sunk below invariant loads.
    if (!IsLoadChain && isInvariantLoad(I))
      continue;

    // If I is in the chain, we can tell whether it aliases ChainIt by checking
    // what offset ChainIt accesses.  This may be better than AA is able to do.
    //
    // We should really only have duplicate offsets for stores (the duplicate
    // loads should be CSE'ed), but in case we have a duplicate load, we'll
    // split the chain so we don't have to handle this case specially.
    if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
      // I and ChainElem overlap if:
      //   - I and ChainElem have the same offset, OR
      //   - I's offset is less than ChainElem's, but I touches past the
      //     beginning of ChainElem, OR
      //   - ChainElem's offset is less than I's, but ChainElem touches past
      //     the beginning of I.
      const APInt &IOffset = OffsetIt->second;
      unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
      if (IOffset == ChainElemOffset ||
          (IOffset.sle(ChainElemOffset) &&
           (IOffset + IElemSize).sgt(ChainElemOffset)) ||
          (ChainElemOffset.sle(IOffset) &&
           (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
        LLVM_DEBUG({
          // Double check that AA also sees this alias.  If not, we probably
          // have a bug.
          ModRefInfo MR =
              BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
          assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
          dbgs() << "LSV: Found alias in chain: " << *I << "\n";
        });
        return false; // We found an aliasing instruction; bail.
      }

      continue; // We're confident there's no alias.
    }

    // I is not part of the chain; fall back to an alias-analysis query.
    LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
    ModRefInfo MR = BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
    if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
      LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
                        << "  Aliasing instruction:\n"
                        << "  " << *I << '\n'
                        << "  Aliased instruction and pointer:\n"
                        << "  " << *ChainElem << '\n'
                        << "  " << *getLoadStorePointerOperand(ChainElem)
                        << '\n');

      return false;
    }
  }
  return true;
}
1361
1364 return (Signed && BinOpI->hasNoSignedWrap()) ||
1365 (!Signed && BinOpI->hasNoUnsignedWrap());
1366}
1367
1368static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
1369 unsigned MatchingOpIdxA, Instruction *AddOpB,
1370 unsigned MatchingOpIdxB, bool Signed) {
1371 LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
1372 << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
1373 << MatchingOpIdxA << ", AddOpB=" << *AddOpB
1374 << ", MatchingOpIdxB=" << MatchingOpIdxB
1375 << ", Signed=" << Signed << "\n");
1376 // If both OpA and OpB are adds with NSW/NUW and with one of the operands
1377 // being the same, we can guarantee that the transformation is safe if we can
1378 // prove that OpA won't overflow when Ret added to the other operand of OpA.
1379 // For example:
1380 // %tmp7 = add nsw i32 %tmp2, %v0
1381 // %tmp8 = sext i32 %tmp7 to i64
1382 // ...
1383 // %tmp11 = add nsw i32 %v0, 1
1384 // %tmp12 = add nsw i32 %tmp2, %tmp11
1385 // %tmp13 = sext i32 %tmp12 to i64
1386 //
1387 // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
1388 // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
1389 // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
1390 assert(AddOpA->getOpcode() == Instruction::Add &&
1391 AddOpB->getOpcode() == Instruction::Add &&
1392 checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
1393 if (AddOpA->getOperand(MatchingOpIdxA) ==
1394 AddOpB->getOperand(MatchingOpIdxB)) {
1395 Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
1396 Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
1397 Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
1398 Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
1399 // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
1400 if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
1401 checkNoWrapFlags(OtherInstrB, Signed) &&
1402 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1403 int64_t CstVal =
1404 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1405 if (OtherInstrB->getOperand(0) == OtherOperandA &&
1406 IdxDiff.getSExtValue() == CstVal)
1407 return true;
1408 }
1409 // Match `x +nsw/nuw (y +nsw/nuw -Idx)` and `x +nsw/nuw (y +nsw/nuw x)`.
1410 if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
1411 checkNoWrapFlags(OtherInstrA, Signed) &&
1412 isa<ConstantInt>(OtherInstrA->getOperand(1))) {
1413 int64_t CstVal =
1414 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1415 if (OtherInstrA->getOperand(0) == OtherOperandB &&
1416 IdxDiff.getSExtValue() == -CstVal)
1417 return true;
1418 }
1419 // Match `x +nsw/nuw (y +nsw/nuw c)` and
1420 // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
1421 if (OtherInstrA && OtherInstrB &&
1422 OtherInstrA->getOpcode() == Instruction::Add &&
1423 OtherInstrB->getOpcode() == Instruction::Add &&
1424 checkNoWrapFlags(OtherInstrA, Signed) &&
1425 checkNoWrapFlags(OtherInstrB, Signed) &&
1426 isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
1427 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1428 int64_t CstValA =
1429 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1430 int64_t CstValB =
1431 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1432 if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
1433 IdxDiff.getSExtValue() == (CstValB - CstValA))
1434 return true;
1435 }
1436 }
1437 return false;
1438}
1439
1440std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
1441 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1442 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
1443 << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
1444 << " Depth=" << Depth << "\n");
1445 auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
1446 auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
1447 if (!GEPA || !GEPB)
1448 return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);
1449
1450 // Look through GEPs after checking they're the same except for the last
1451 // index.
1452 if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
1453 GEPA->getPointerOperand() != GEPB->getPointerOperand() ||
1454 GEPA->getSourceElementType() != GEPB->getSourceElementType())
1455 return std::nullopt;
1456 gep_type_iterator GTIA = gep_type_begin(GEPA);
1457 gep_type_iterator GTIB = gep_type_begin(GEPB);
1458 for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
1459 if (GTIA.getOperand() != GTIB.getOperand())
1460 return std::nullopt;
1461 ++GTIA;
1462 ++GTIB;
1463 }
1464
1467 if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
1468 OpA->getType() != OpB->getType())
1469 return std::nullopt;
1470
1471 uint64_t Stride = GTIA.getSequentialElementStride(DL);
1472
1473 // Only look through a ZExt/SExt.
1474 if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
1475 return std::nullopt;
1476
1477 bool Signed = isa<SExtInst>(OpA);
1478
1479 // At this point A could be a function parameter, i.e. not an instruction
1480 Value *ValA = OpA->getOperand(0);
1481 OpB = dyn_cast<Instruction>(OpB->getOperand(0));
1482 if (!OpB || ValA->getType() != OpB->getType())
1483 return std::nullopt;
1484
1485 const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
1486 const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
1487 const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
1488 if (IdxDiffSCEV == SE.getCouldNotCompute())
1489 return std::nullopt;
1490
1491 ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
1492 if (!IdxDiffRange.isSingleElement())
1493 return std::nullopt;
1494 APInt IdxDiff = *IdxDiffRange.getSingleElement();
1495
1496 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
1497 << "\n");
1498
1499 // Now we need to prove that adding IdxDiff to ValA won't overflow.
1500 bool Safe = false;
1501
1502 // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
1503 // ValA, we're okay.
1504 if (OpB->getOpcode() == Instruction::Add &&
1505 isa<ConstantInt>(OpB->getOperand(1)) &&
1506 IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
1508 Safe = true;
1509
1510 // Second attempt: check if we have eligible add NSW/NUW instruction
1511 // sequences.
1512 OpA = dyn_cast<Instruction>(ValA);
1513 if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
1514 OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
1515 checkNoWrapFlags(OpB, Signed)) {
1516 // In the checks below a matching operand in OpA and OpB is an operand which
1517 // is the same in those two instructions. Below we account for possible
1518 // orders of the operands of these add instructions.
1519 for (unsigned MatchingOpIdxA : {0, 1})
1520 for (unsigned MatchingOpIdxB : {0, 1})
1521 if (!Safe)
1522 Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
1523 MatchingOpIdxB, Signed);
1524 }
1525
1526 unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
1527
1528 // Third attempt:
1529 //
1530 // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
1531 // order bit other than the sign bit are known to be zero in ValA, we can add
1532 // Diff to it while guaranteeing no overflow of any sort.
1533 //
1534 // If IdxDiff is negative, do the same, but swap ValA and ValB.
1535 if (!Safe) {
1536 // When computing known bits, use the GEPs as context instructions, since
1537 // they likely are in the same BB as the load/store.
1538 KnownBits Known(BitWidth);
1539 computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, &AC, ContextInst,
1540 &DT);
1541 APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
1542 if (Signed)
1543 BitsAllowedToBeSet.clearBit(BitWidth - 1);
1544 Safe = BitsAllowedToBeSet.uge(IdxDiff.abs());
1545 }
1546
1547 if (Safe)
1548 return IdxDiff * Stride;
1549 return std::nullopt;
1550}
1551
1552std::optional<APInt> Vectorizer::getConstantOffsetSelects(
1553 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1554 if (Depth++ == MaxDepth)
1555 return std::nullopt;
1556
1557 if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
1558 if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
1559 if (SelectA->getCondition() != SelectB->getCondition())
1560 return std::nullopt;
1561 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
1562 << ", PtrB=" << *PtrB << ", ContextInst="
1563 << *ContextInst << ", Depth=" << Depth << "\n");
1564 std::optional<APInt> TrueDiff = getConstantOffset(
1565 SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
1566 if (!TrueDiff)
1567 return std::nullopt;
1568 std::optional<APInt> FalseDiff =
1569 getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
1570 ContextInst, Depth);
1571 if (TrueDiff == FalseDiff)
1572 return TrueDiff;
1573 }
1574 }
1575 return std::nullopt;
1576}
1577
// Merges equivalence classes that agree on address space, element size, and
// load-vs-store but differ in underlying object, whenever one class's
// underlying object resolves (through 1-level indirections) to another's.
// This lets accesses that reach the same memory through a thin wrapper be
// considered for vectorization together.
void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
  if (EQClasses.size() < 2) // There is nothing to merge.
    return;

  // The reduced key has all elements of the ECClassKey except the underlying
  // object. Check that EqClassKey has 4 elements and define the reduced key.
  static_assert(std::tuple_size_v<EqClassKey> == 4,
                "EqClassKey has changed - EqClassReducedKey needs changes too");
  using EqClassReducedKey =
      std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */,
                 std::tuple_element_t<2, EqClassKey> /* Element size */,
                 std::tuple_element_t<3, EqClassKey> /* IsLoad; */>;
  using ECReducedKeyToUnderlyingObjectMap =
      MapVector<EqClassReducedKey,
                SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>;

  // Form a map from the reduced key (without the underlying object) to the
  // underlying objects: 1 reduced key to many underlying objects, to form
  // groups of potentially merge-able equivalence classes.
  ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap;
  bool FoundPotentiallyOptimizableEC = false;
  for (const auto &EC : EQClasses) {
    const auto &Key = EC.first;
    EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
                             std::get<3>(Key)};
    auto &UOMap = RedKeyToUOMap[RedKey];
    UOMap.insert(std::get<0>(Key));
    // Two classes sharing a reduced key are the only candidates for merging.
    if (UOMap.size() > 1)
      FoundPotentiallyOptimizableEC = true;
  }
  if (!FoundPotentiallyOptimizableEC)
    return;

  LLVM_DEBUG({
    dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n";
    for (const auto &EC : EQClasses) {
      dbgs() << "  Key: {" << EC.first << "}\n";
      for (const auto &Inst : EC.second)
        dbgs() << "    Inst: " << *Inst << '\n';
    }
  });
  LLVM_DEBUG({
    dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
    for (const auto &RedKeyToUO : RedKeyToUOMap) {
      dbgs() << "  Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", "
             << std::get<1>(RedKeyToUO.first) << ", "
             << static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> "
             << RedKeyToUO.second.size() << " underlying objects:\n";
      for (auto UObject : RedKeyToUO.second)
        dbgs() << "    " << *UObject << '\n';
    }
  });

  using UObjectToUObjectMap = DenseMap<const Value *, const Value *>;

  // Compute the ultimate targets for a set of underlying objects.
  // Each object is chased through at most 1-level getUnderlyingObject hops,
  // then chains of such hops are collapsed to their final target.
  auto GetUltimateTargets =
      [](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap {
    UObjectToUObjectMap IndirectionMap;
    for (const auto *UObject : UObjects) {
      const unsigned MaxLookupDepth = 1; // look for 1-level indirections only
      const auto *UltimateTarget = getUnderlyingObject(UObject, MaxLookupDepth);
      if (UltimateTarget != UObject)
        IndirectionMap[UObject] = UltimateTarget;
    }
    UObjectToUObjectMap UltimateTargetsMap;
    for (const auto *UObject : UObjects) {
      auto Target = UObject;
      auto It = IndirectionMap.find(Target);
      for (; It != IndirectionMap.end(); It = IndirectionMap.find(Target))
        Target = It->second;
      UltimateTargetsMap[UObject] = Target;
    }
    return UltimateTargetsMap;
  };

  // For each item in RedKeyToUOMap, if it has more than one underlying object,
  // try to merge the equivalence classes.
  for (auto &[RedKey, UObjects] : RedKeyToUOMap) {
    if (UObjects.size() < 2)
      continue;
    auto UTMap = GetUltimateTargets(UObjects);
    for (const auto &[UObject, UltimateTarget] : UTMap) {
      if (UObject == UltimateTarget)
        continue;

      EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
                         std::get<2>(RedKey)};
      EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
                       std::get<2>(RedKey)};
      // The entry for KeyFrom is guarantted to exist, unlike KeyTo. Thus,
      // request the reference to the instructions vector for KeyTo first.
      // NOTE(review): the order matters because operator[] may insert KeyTo
      // and thereby grow the MapVector, which could invalidate a previously
      // taken reference to KeyFrom's vector.
      const auto &VecTo = EQClasses[KeyTo];
      const auto &VecFrom = EQClasses[KeyFrom];
      SmallVector<Instruction *, 8> MergedVec;
      // Merge keeping program order; both inputs are already in program order.
      std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(),
                 std::back_inserter(MergedVec),
                 [](Instruction *A, Instruction *B) {
                   return A && B && A->comesBefore(B);
                 });
      EQClasses[KeyTo] = std::move(MergedVec);
      EQClasses.erase(KeyFrom);
    }
  }
  LLVM_DEBUG({
    dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n";
    for (const auto &EC : EQClasses) {
      dbgs() << "  Key: {" << EC.first << "}\n";
      for (const auto &Inst : EC.second)
        dbgs() << "    Inst: " << *Inst << '\n';
    }
  });
}
1691
// Scan [Begin, End) and bucket every vectorizable simple load/store into an
// equivalence class keyed by {underlying pointer object, address space,
// element bit width, is-load}.  Only accesses in the same class can possibly
// be merged into one vector access, so later passes compare only within a
// class.  Classes whose underlying objects are 1-level indirections of each
// other are merged at the end via mergeEquivalenceClasses().
1692EquivalenceClassMap
1693Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
1695  EquivalenceClassMap Ret;
1696
  // Key a pointer by its underlying object; for selects, key by the select's
  // condition instead (rationale in the comment below).
1697  auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
1698    const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
1699    if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
1700      // The select's themselves are distinct instructions even if they share
1701      // the same condition and evaluate to consecutive pointers for true and
1702      // false values of the condition. Therefore using the select's themselves
1703      // for grouping instructions would put consecutive accesses into different
1704      // lists and they won't be even checked for being consecutive, and won't
1705      // be vectorized.
1706      return Sel->getCondition();
1707    }
1708    return ObjPtr;
1709  };
1710
1711  for (Instruction &I : make_range(Begin, End)) {
1712    auto *LI = dyn_cast<LoadInst>(&I);
1713    auto *SI = dyn_cast<StoreInst>(&I);
    // Only plain loads and stores participate.
1714    if (!LI && !SI)
1715      continue;
1716
    // Skip non-simple (e.g. volatile/atomic) accesses; they must not be
    // widened or reordered.
1717    if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
1718      continue;
1719
    // Let the target veto vectorization of this individual access.
1720    if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
1721        (SI && !TTI.isLegalToVectorizeStore(SI)))
1722      continue;
1723
1724    Type *Ty = getLoadStoreType(&I);
1725    if (!VectorType::isValidElementType(Ty->getScalarType()))
1726      continue;
1727
1728    // Skip weird non-byte sizes. They probably aren't worth the effort of
1729    // handling correctly.
1730    unsigned TySize = DL.getTypeSizeInBits(Ty);
1731    if ((TySize % 8) != 0)
1732      continue;
1733
1734    // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
1735    // functions are currently using an integer type for the vectorized
1736    // load/store, and does not support casting between the integer type and a
1737    // vector of pointers (e.g. i64 to <2 x i16*>)
1738    if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
1739      continue;
1740
1742    unsigned AS = Ptr->getType()->getPointerAddressSpace();
1743    unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1744
    // Maximum number of such elements that fit in one vector register.
1745    unsigned VF = VecRegSize / TySize;
1746    VectorType *VecTy = dyn_cast<VectorType>(Ty);
1747
1748    // Only handle power-of-two sized elements.
1749    if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
1750        (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
1751      continue;
1752
1753    // No point in looking at these if they're too big to vectorize.
1754    if (TySize > VecRegSize / 2 ||
1755        (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
1756      continue;
1757
    // Append the instruction to its class; classes keep BB order because we
    // iterate the block front to back.
1758    Ret[{GetUnderlyingObject(Ptr), AS,
1759         DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
1760         /*IsLoad=*/LI != nullptr}]
1761        .emplace_back(&I);
1762  }
1763
  // Merge classes whose keys differ only by a 1-level pointer indirection.
1764  mergeEquivalenceClasses(Ret);
1765  return Ret;
1766}
1767
// Partition the instructions of one equivalence class into chains: groups of
// accesses whose addresses are at compile-time-constant offsets from the
// chain's leader.  Uses an MRU list so each instruction is compared against
// at most MaxChainsToTry existing chains.  Only chains with more than one
// element are returned (a singleton chain cannot be vectorized).
1768std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
1769  if (Instrs.empty())
1770    return {};
1771
  // All instructions in a class share one address space (asserted below).
1772  unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
1773  unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
1774
1775#ifndef NDEBUG
1776  // Check that Instrs is in BB order and all have the same addr space.
1777  for (size_t I = 1; I < Instrs.size(); ++I) {
1778    assert(Instrs[I - 1]->comesBefore(Instrs[I]));
1779    assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
1780  }
1781#endif
1782
1783  // Machinery to build an MRU-hashtable of Chains.
1784  //
1785  // (Ideally this could be done with MapVector, but as currently implemented,
1786  // moving an element to the front of a MapVector is O(n).)
  //
  // Each node pairs a chain's leader instruction with the chain itself and
  // lives on both an intrusive MRU list and in a DenseSet hashed by leader.
1787  struct InstrListElem : ilist_node<InstrListElem>,
1788                         std::pair<Instruction *, Chain> {
1789    explicit InstrListElem(Instruction *I)
1790        : std::pair<Instruction *, Chain>(I, {}) {}
1791  };
  // DenseMapInfo that hashes/compares nodes by their leader instruction.
1792  struct InstrListElemDenseMapInfo {
1793    using PtrInfo = DenseMapInfo<InstrListElem *>;
1794    using IInfo = DenseMapInfo<Instruction *>;
1795    static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
1796    static InstrListElem *getTombstoneKey() {
1797      return PtrInfo::getTombstoneKey();
1798    }
1799    static unsigned getHashValue(const InstrListElem *E) {
1800      return IInfo::getHashValue(E->first);
1801    }
1802    static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
      // Sentinel keys must only compare equal to themselves; never deref them.
1803      if (A == getEmptyKey() || B == getEmptyKey())
1804        return A == getEmptyKey() && B == getEmptyKey();
1805      if (A == getTombstoneKey() || B == getTombstoneKey())
1806        return A == getTombstoneKey() && B == getTombstoneKey();
1807      return IInfo::isEqual(A->first, B->first);
1808    }
1809  };
  // Nodes are arena-allocated; freed wholesale when Allocator goes out of
  // scope at the end of this function.
1810  SpecificBumpPtrAllocator<InstrListElem> Allocator;
1811  simple_ilist<InstrListElem> MRU;
1812  DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;
1813
1814  // Compare each instruction in `instrs` to leader of the N most recently-used
1815  // chains. This limits the O(n^2) behavior of this pass while also allowing
1816  // us to build arbitrarily long chains.
1817  for (Instruction *I : Instrs) {
1818    constexpr int MaxChainsToTry = 64;
1819
1820    bool MatchFound = false;
1821    auto ChainIter = MRU.begin();
1822    for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
1823         ++J, ++ChainIter) {
      // ContextInst must be the later of the two instructions so dominance
      // queries in getConstantOffset are well-formed.
1824      if (std::optional<APInt> Offset = getConstantOffset(
1825              getLoadStorePointerOperand(ChainIter->first),
1827              /*ContextInst=*/
1828              (ChainIter->first->comesBefore(I) ? I : ChainIter->first))) {
1829        // `Offset` might not have the expected number of bits, if e.g. AS has a
1830        // different number of bits than opaque pointers.
1831        ChainIter->second.emplace_back(I, Offset.value());
1832        // Move ChainIter to the front of the MRU list.
1833        MRU.remove(*ChainIter);
1834        MRU.push_front(*ChainIter);
1835        MatchFound = true;
1836        break;
1837      }
1838    }
1839
    // No existing chain accepted I: it becomes the leader of a new chain,
    // at offset 0 from itself.
1840    if (!MatchFound) {
1841      APInt ZeroOffset(ASPtrBits, 0);
1842      InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
1843      E->second.emplace_back(I, ZeroOffset);
1844      MRU.push_front(*E);
1845      Chains.insert(E);
1846    }
1847  }
1848
1849  std::vector<Chain> Ret;
1850  Ret.reserve(Chains.size());
1851  // Iterate over MRU rather than Chains so the order is deterministic.
1852  for (auto &E : MRU)
1853    if (E.second.size() > 1)
1854      Ret.emplace_back(std::move(E.second));
1855  return Ret;
1856}
1857
1858std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
1859 Instruction *ContextInst,
1860 unsigned Depth) {
1861 LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
1862 << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst
1863 << ", Depth=" << Depth << "\n");
1864 // We'll ultimately return a value of this bit width, even if computations
1865 // happen in a different width.
1866 unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
1867 APInt OffsetA(OrigBitWidth, 0);
1868 APInt OffsetB(OrigBitWidth, 0);
1869 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1870 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1871 unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
1872 if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
1873 return std::nullopt;
1874
1875 // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
1876 // should properly handle a possible overflow and the value should fit into
1877 // the smallest data type used in the cast/gep chain.
1878 assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
1879 OffsetB.getSignificantBits() <= NewPtrBitWidth);
1880
1881 OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
1882 OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
1883 if (PtrA == PtrB)
1884 return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);
1885
1886 // Try to compute B - A.
1887 const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
1888 if (DistScev != SE.getCouldNotCompute()) {
1889 LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
1890 ConstantRange DistRange = SE.getSignedRange(DistScev);
1891 if (DistRange.isSingleElement()) {
1892 // Handle index width (the width of Dist) != pointer width (the width of
1893 // the Offset*s at this point).
1894 APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
1895 return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
1896 }
1897 }
1898 if (std::optional<APInt> Diff =
1899 getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth))
1900 return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
1901 .sextOrTrunc(OrigBitWidth);
1902 return std::nullopt;
1903}
1904
1905bool Vectorizer::accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS,
1906 Align Alignment,
1907 unsigned VecElemBits) const {
1908 // Aligned vector accesses are ALWAYS faster than element-wise accesses.
1909 if (Alignment.value() % SizeBytes == 0)
1910 return true;
1911
1912 // Ask TTI whether misaligned accesses are faster as vector or element-wise.
1913 unsigned VectorizedSpeed = 0;
1914 bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
1915 F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
1916 if (!AllowsMisaligned) {
1917 LLVM_DEBUG(
1918 dbgs() << "LSV: Access of " << SizeBytes << "B in addrspace " << AS
1919 << " with alignment " << Alignment.value()
1920 << " is misaligned, and therefore can't be vectorized.\n");
1921 return false;
1922 }
1923
1924 unsigned ElementwiseSpeed = 0;
1925 (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
1926 Alignment, &ElementwiseSpeed);
1927 if (VectorizedSpeed < ElementwiseSpeed) {
1928 LLVM_DEBUG(dbgs() << "LSV: Access of " << SizeBytes << "B in addrspace "
1929 << AS << " with alignment " << Alignment.value()
1930 << " has relative speed " << VectorizedSpeed
1931 << ", which is lower than the elementwise speed of "
1932 << ElementwiseSpeed
1933 << ". Therefore this access won't be vectorized.\n");
1934 return false;
1935 }
1936 return true;
1937}
1938
1939ChainElem Vectorizer::createExtraElementAfter(const ChainElem &Prev, Type *Ty,
1940 APInt Offset, StringRef Prefix,
1941 Align Alignment) {
1942 Instruction *NewElement = nullptr;
1943 Builder.SetInsertPoint(Prev.Inst->getNextNode());
1944 if (LoadInst *PrevLoad = dyn_cast<LoadInst>(Prev.Inst)) {
1945 Value *NewGep = Builder.CreatePtrAdd(
1946 PrevLoad->getPointerOperand(), Builder.getInt(Offset), Prefix + "GEP");
1947 LLVM_DEBUG(dbgs() << "LSV: Extra GEP Created: \n" << *NewGep << "\n");
1948 NewElement = Builder.CreateAlignedLoad(Ty, NewGep, Alignment, Prefix);
1949 } else {
1950 StoreInst *PrevStore = cast<StoreInst>(Prev.Inst);
1951
1952 Value *NewGep = Builder.CreatePtrAdd(
1953 PrevStore->getPointerOperand(), Builder.getInt(Offset), Prefix + "GEP");
1954 LLVM_DEBUG(dbgs() << "LSV: Extra GEP Created: \n" << *NewGep << "\n");
1955 NewElement =
1956 Builder.CreateAlignedStore(PoisonValue::get(Ty), NewGep, Alignment);
1957 }
1958
1959 // Attach all metadata to the new element.
1960 // propagateMetadata will fold it into the final vector when applicable.
1961 NewElement->copyMetadata(*Prev.Inst);
1962
1963 // Cache created elements for tracking and cleanup
1964 ExtraElements.insert(NewElement);
1965
1966 APInt NewOffsetFromLeader = Prev.OffsetFromLeader + Offset;
1967 LLVM_DEBUG(dbgs() << "LSV: Extra Element Created: \n"
1968 << *NewElement
1969 << " OffsetFromLeader: " << NewOffsetFromLeader << "\n");
1970 return ChainElem{NewElement, NewOffsetFromLeader};
1971}
1972
// Build the constant i1 mask for a masked load/store of type VecTy covering
// chain C: lanes backed by real chain elements are true, lanes belonging to
// synthesized extra (filler) elements stay false.  A vector-typed chain
// element marks all of its lanes as true.
1973Value *Vectorizer::createMaskForExtraElements(const ArrayRef<ChainElem> C,
1974                                              FixedVectorType *VecTy) {
1975  // Start each mask element as false
1977                                   Builder.getInt1(false));
1978  // Iterate over the chain and set the corresponding mask element to true for
1979  // each element that is not an extra element.
1980  for (const ChainElem &E : C) {
1981    if (ExtraElements.contains(E.Inst))
1982      continue;
    // Byte distance of this element from the chain leader (C[0]).
1983    unsigned EOffset =
1984        (E.OffsetFromLeader - C[0].OffsetFromLeader).getZExtValue();
    // Convert the byte offset into a lane index within VecTy.
1985    unsigned VecIdx =
1986        8 * EOffset / DL.getTypeSizeInBits(VecTy->getScalarType());
1987    if (FixedVectorType *VT =
      // Vector-typed element: it occupies a run of lanes, mark them all.
1989      for (unsigned J = 0; J < VT->getNumElements(); ++J)
1990        MaskElts[VecIdx + J] = Builder.getInt1(true);
1991    else
1992      MaskElts[VecIdx] = Builder.getInt1(true);
1993  }
1994  return ConstantVector::get(MaskElts);
1995}
1996
// Remove every filler access created by createExtraElementAfter, then reset
// the tracking set.  Extra loads are expected to be trivially dead by now;
// extra stores are deleted unconditionally, followed by cleanup of their
// now-unused address computation.
1997void Vectorizer::deleteExtraElements() {
1998  for (auto *ExtraElement : ExtraElements) {
1999    if (isa<LoadInst>(ExtraElement)) {
      // Extra loads have no real users, so recursive dead-code deletion must
      // succeed; Deleted only exists to feed the assert below.
2000      [[maybe_unused]] bool Deleted =
2002      assert(Deleted && "Extra Load should always be trivially dead");
2003    } else {
2004      // Unlike Extra Loads, Extra Stores won't be "dead", but should all be
2005      // deleted regardless. They will have either been combined into a masked
2006      // store, or will be left behind and need to be cleaned up.
      // Grab the address operand first so its (now possibly dead) GEP chain
      // can be cleaned up after the store itself is erased.
2007      auto *PtrOperand = getLoadStorePointerOperand(ExtraElement);
2008      ExtraElement->eraseFromParent();
2010    }
2011  }
2012
2013  ExtraElements.clear();
2014}
assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file defines the DenseMap class.
static bool runOnFunction(Function &F, bool PostInlining)
#define DEBUG_TYPE
Module.h This file contains the declarations for the Module class.
static bool checkNoWrapFlags(Instruction *I, bool Signed)
static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA, unsigned MatchingOpIdxA, Instruction *AddOpB, unsigned MatchingOpIdxB, bool Signed)
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
#define T
static bool isInvariantLoad(const Instruction *I, const Value *Ptr, const bool IsKernelFn)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
Basic Register Allocator
This file contains some templates that are useful if you are working with the STL at all.
Provides some synthesis utilities to produce sequences of values.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
This pass exposes codegen information to IR-level passes.
static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use, const MachineInstr *Insert, const WebAssemblyFunctionInfo &MFI, const MachineRegisterInfo &MRI, bool Optimize)
A manager for alias analyses.
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Class for arbitrary precision integers.
Definition APInt.h:78
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1421
APInt abs() const
Get the absolute value.
Definition APInt.h:1810
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition APInt.h:1173
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1052
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1244
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
const APInt * getSingleElement() const
If this set contains a single element, return it, otherwise return null.
bool isSingleElement() const
Return true if this set contains exactly one member.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
ValueT & at(const_arg_type_t< KeyT > Val)
at - Return the entry for the specified key, or abort if no such entry exists.
Definition DenseMap.h:224
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
iterator end()
Definition DenseMap.h:81
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:316
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
Legacy wrapper pass to provide the GlobalsAAResult object.
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition IRBuilder.h:497
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2584
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2572
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1894
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Value * CreatePtrAdd(Value *Ptr, Value *Offset, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2048
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:522
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2281
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2606
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition IRBuilder.h:1913
ConstantInt * getInt(const APInt &AI)
Get a constant integer value.
Definition IRBuilder.h:537
LLVM_ABI bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
LLVM_ABI bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
An instruction for reading from memory.
bool isSimple() const
LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition MapVector.h:124
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
Legacy wrapper pass to provide the SCEVAAResult object.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
LLVM_ABI const SCEV * getCouldNotCompute()
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Value * getPointerOperand()
Analysis pass providing the TargetTransformInfo.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI bool isLegalToVectorizeLoad(LoadInst *LI) const
LLVM_ABI bool isLegalToVectorizeStore(StoreInst *SI) const
LLVM_ABI bool isLegalMaskedStore(Type *DataType, Align Alignment, unsigned AddressSpace, MaskKind MaskKind=VariableOrConstantMask) const
Return true if the target supports masked store.
LLVM_ABI unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
LLVM_ABI bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
LLVM_ABI unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
LLVM_ABI unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
LLVM_ABI bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace=0, Align Alignment=Align(1), unsigned *Fast=nullptr) const
Determine if the target supports unaligned memory accesses.
LLVM_ABI bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
LLVM_ABI bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace, MaskKind MaskKind=VariableOrConstantMask) const
Return true if the target supports masked load.
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition Type.h:287
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
Value * getOperand(unsigned i) const
Definition User.h:207
unsigned getNumOperands() const
Definition User.h:229
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition Value.h:761
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:713
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type size() const
Definition DenseSet.h:87
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
TypeSize getSequentialElementStride(const DataLayout &DL) const
const ParentTy * getParent() const
Definition ilist_node.h:34
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
void push_front(reference Node)
Insert a node at the front; never copies.
void remove(reference N)
Remove a node by reference; never deletes.
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
Abstract Attribute helper functions.
Definition Attributor.h:165
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition APInt.h:2268
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
ElementType
The element type of an SRV or UAV resource.
Definition DXILABI.h:68
Context & getContext() const
Definition BasicBlock.h:99
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:535
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition APFloat.h:1630
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI Pass * createLoadStoreVectorizerPass()
Create a legacy pass manager instance of the LoadStoreVectorizer pass.
iterator_range< po_iterator< T > > post_order(const T &G)
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI Align getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to ensure that the alignment of V is at least PrefAlign bytes.
Definition Local.cpp:1581
bool isModSet(const ModRefInfo MRI)
Definition ModRef.h:49
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
generic_gep_type_iterator<> gep_type_iterator
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector&lt;T, 0&gt;).
bool isa(const From &Val)
isa&lt;X&gt; - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition ModRef.h:28
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2088
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1917
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors (including the next instruction that follows within a basic block).
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
gep_type_iterator gep_type_begin(const User *GEP)
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent to: C.erase(remove_if(C.begin(), C.end(), pred), C.end()).
Definition STLExtras.h:2192
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal.address from the specified value V, returning the original object being addressed.
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77