114#include <type_traits>
120#define DEBUG_TYPE "load-store-vectorizer"
122STATISTIC(NumVectorInstructions,
"Number of vector accesses generated");
123STATISTIC(NumScalarsVectorized,
"Number of scalar accesses vectorized");
133 std::tuple<
const Value * ,
139 const EqClassKey &K) {
142 <<
" of element size " << ElementSize <<
" bits in addrspace "
159 APInt OffsetFromLeader;
161 : Inst(
std::
move(Inst)), OffsetFromLeader(
std::
move(OffsetFromLeader)) {}
165void sortChainInBBOrder(Chain &
C) {
166 sort(
C, [](
auto &
A,
auto &
B) {
return A.Inst->comesBefore(
B.Inst); });
169void sortChainInOffsetOrder(Chain &
C) {
170 sort(
C, [](
const auto &
A,
const auto &
B) {
171 if (
A.OffsetFromLeader !=
B.OffsetFromLeader)
172 return A.OffsetFromLeader.slt(
B.OffsetFromLeader);
173 return A.Inst->comesBefore(
B.Inst);
178 for (
const auto &E :
C) {
179 dbgs() <<
" " << *E.Inst <<
" (offset " << E.OffsetFromLeader <<
")\n";
183using EquivalenceClassMap =
187constexpr unsigned StackAdjustedAlignment = 4;
191 for (
const ChainElem &E :
C)
197 const LoadInst *LI = dyn_cast<LoadInst>(
I);
198 return LI !=
nullptr && LI->
hasMetadata(LLVMContext::MD_invariant_load);
208 while (!Worklist.
empty()) {
211 for (
int Idx = 0;
Idx < NumOperands;
Idx++) {
213 if (!IM || IM->
getOpcode() == Instruction::PHI)
221 assert(IM !=
I &&
"Unexpected cycle while re-ordering instructions");
224 InstructionsToMove.
insert(IM);
231 for (
auto BBI =
I->getIterator(), E =
I->getParent()->end(); BBI != E;) {
233 if (!InstructionsToMove.
contains(IM))
258 :
F(
F), AA(AA), AC(AC), DT(DT), SE(SE),
TTI(
TTI),
259 DL(
F.getDataLayout()), Builder(SE.getContext()) {}
264 static const unsigned MaxDepth = 3;
273 bool runOnEquivalenceClass(
const EqClassKey &EqClassKey,
279 bool runOnChain(Chain &
C);
284 std::vector<Chain> splitChainByContiguity(Chain &
C);
289 std::vector<Chain> splitChainByMayAliasInstrs(Chain &
C);
293 std::vector<Chain> splitChainByAlignment(Chain &
C);
297 bool vectorizeChain(Chain &
C);
300 std::optional<APInt> getConstantOffset(
Value *PtrA,
Value *PtrB,
303 std::optional<APInt> getConstantOffsetComplexAddrs(
Value *PtrA,
Value *PtrB,
306 std::optional<APInt> getConstantOffsetSelects(
Value *PtrA,
Value *PtrB,
313 Type *getChainElemTy(
const Chain &
C);
322 template <
bool IsLoadChain>
330 void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses)
const;
348class LoadStoreVectorizerLegacyPass :
public FunctionPass {
360 return "GPU Load and Store Vectorizer";
375char LoadStoreVectorizerLegacyPass::ID = 0;
378 "Vectorize load and Store instructions",
false,
false)
389 return new LoadStoreVectorizerLegacyPass();
392bool LoadStoreVectorizerLegacyPass::runOnFunction(
Function &
F) {
394 if (skipFunction(
F) ||
F.hasFnAttribute(Attribute::NoImplicitFloat))
397 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
398 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
399 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
401 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
F);
404 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
F);
406 return Vectorizer(
F, AA, AC, DT, SE,
TTI).run();
412 if (
F.hasFnAttribute(Attribute::NoImplicitFloat))
421 bool Changed = Vectorizer(
F, AA, AC, DT, SE,
TTI).
run();
427bool Vectorizer::run() {
428 bool Changed =
false;
454 for (
auto It = Barriers.
begin(),
End = std::prev(Barriers.
end()); It !=
End;
456 Changed |= runOnPseudoBB(*It, *std::next(It));
461 I->eraseFromParent();
473 dbgs() <<
"LSV: Running on pseudo-BB [" << *Begin <<
" ... ";
474 if (
End != Begin->getParent()->end())
477 dbgs() <<
"<BB end>";
481 bool Changed =
false;
482 for (
const auto &[EqClassKey, EqClass] :
483 collectEquivalenceClasses(Begin,
End))
484 Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
489bool Vectorizer::runOnEquivalenceClass(
const EqClassKey &EqClassKey,
491 bool Changed =
false;
494 dbgs() <<
"LSV: Running on equivalence class of size " << EqClass.
size()
495 <<
" keyed on " << EqClassKey <<
":\n";
497 dbgs() <<
" " << *
I <<
"\n";
500 std::vector<Chain> Chains = gatherChains(EqClass);
502 <<
" nontrivial chains.\n";);
503 for (Chain &
C : Chains)
504 Changed |= runOnChain(
C);
508bool Vectorizer::runOnChain(Chain &
C) {
510 dbgs() <<
"LSV: Running on chain with " <<
C.size() <<
" instructions:\n";
520 bool Changed =
false;
521 for (
auto &
C : splitChainByMayAliasInstrs(
C))
522 for (
auto &
C : splitChainByContiguity(
C))
523 for (
auto &
C : splitChainByAlignment(
C))
524 Changed |= vectorizeChain(
C);
528std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &
C) {
532 sortChainInBBOrder(
C);
535 dbgs() <<
"LSV: splitChainByMayAliasInstrs considering chain:\n";
543 for (
const auto &E :
C)
544 ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
557 auto Impl = [&](
auto IsLoad) {
559 auto [ChainBegin, ChainEnd] = [&](
auto IsLoad) {
560 if constexpr (IsLoad())
561 return std::make_pair(
C.begin(),
C.end());
563 return std::make_pair(
C.rbegin(),
C.rend());
565 assert(ChainBegin != ChainEnd);
567 std::vector<Chain> Chains;
570 for (
auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
571 if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.
front().Inst,
573 LLVM_DEBUG(
dbgs() <<
"LSV: No intervening may-alias instrs; can merge "
574 << *ChainIt->Inst <<
" into " << *ChainBegin->Inst
579 dbgs() <<
"LSV: Found intervening may-alias instrs; cannot merge "
580 << *ChainIt->Inst <<
" into " << *ChainBegin->Inst <<
"\n");
581 if (NewChain.
size() > 1) {
583 dbgs() <<
"LSV: got nontrivial chain without aliasing instrs:\n";
586 Chains.emplace_back(std::move(NewChain));
593 if (NewChain.
size() > 1) {
595 dbgs() <<
"LSV: got nontrivial chain without aliasing instrs:\n";
598 Chains.emplace_back(std::move(NewChain));
603 if (isa<LoadInst>(
C[0].Inst))
604 return Impl(std::bool_constant<true>());
606 assert(isa<StoreInst>(
C[0].Inst));
607 return Impl(std::bool_constant<false>());
610std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &
C) {
614 sortChainInOffsetOrder(
C);
617 dbgs() <<
"LSV: splitChainByContiguity considering chain:\n";
621 std::vector<Chain>
Ret;
622 Ret.push_back({
C.front()});
624 for (
auto It = std::next(
C.begin()),
End =
C.end(); It !=
End; ++It) {
626 auto &CurChain =
Ret.back();
627 const ChainElem &Prev = CurChain.back();
629 assert(SzBits % 8 == 0 &&
"Non-byte sizes should have been filtered out by "
630 "collectEquivalenceClass");
631 APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
634 bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
636 << (AreContiguous ?
"" :
"not ") <<
"contiguous: "
637 << *Prev.Inst <<
" (ends at offset " << PrevReadEnd
638 <<
") -> " << *It->Inst <<
" (starts at offset "
639 << It->OffsetFromLeader <<
")\n");
641 CurChain.push_back(*It);
643 Ret.push_back({*It});
647 llvm::erase_if(Ret, [](
const auto &Chain) {
return Chain.size() <= 1; });
651Type *Vectorizer::getChainElemTy(
const Chain &
C) {
664 if (
any_of(
C, [](
const ChainElem &E) {
672 for (
const ChainElem &E :
C)
678std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &
C) {
691 sortChainInOffsetOrder(
C);
694 dbgs() <<
"LSV: splitChainByAlignment considering chain:\n";
698 bool IsLoadChain = isa<LoadInst>(
C[0].Inst);
699 auto GetVectorFactor = [&](
unsigned VF,
unsigned LoadStoreSize,
702 ChainSizeBytes, VecTy)
704 ChainSizeBytes, VecTy);
708 for (
const auto &E :
C) {
711 "Should have filtered out non-power-of-two elements in "
712 "collectEquivalenceClasses.");
719 std::vector<Chain>
Ret;
720 for (
unsigned CBegin = 0; CBegin <
C.size(); ++CBegin) {
725 for (
unsigned CEnd = CBegin + 1,
Size =
C.size(); CEnd <
Size; ++CEnd) {
726 APInt Sz =
C[CEnd].OffsetFromLeader +
728 C[CBegin].OffsetFromLeader;
729 if (Sz.
sgt(VecRegBytes))
731 CandidateChains.emplace_back(CEnd,
736 for (
auto It = CandidateChains.rbegin(),
End = CandidateChains.rend();
738 auto [CEnd, SizeBytes] = *It;
740 dbgs() <<
"LSV: splitChainByAlignment considering candidate chain ["
741 << *
C[CBegin].Inst <<
" ... " << *
C[CEnd].Inst <<
"]\n");
743 Type *VecElemTy = getChainElemTy(
C);
747 unsigned VecElemBits =
DL.getTypeSizeInBits(VecElemTy);
750 assert((8 * SizeBytes) % VecElemBits == 0);
751 unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
753 unsigned VF = 8 * VecRegBytes / VecElemBits;
756 unsigned TargetVF = GetVectorFactor(VF, VecElemBits,
757 VecElemBits * NumVecElems / 8, VecTy);
758 if (TargetVF != VF && TargetVF < NumVecElems) {
760 dbgs() <<
"LSV: splitChainByAlignment discarding candidate chain "
762 << TargetVF <<
" != VF=" << VF
763 <<
" and TargetVF < NumVecElems=" << NumVecElems <<
"\n");
771 auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &
TTI =
TTI,
773 if (Alignment.value() % SizeBytes == 0)
775 unsigned VectorizedSpeed = 0;
777 F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
778 if (!AllowsMisaligned) {
780 <<
"LSV: Access of " << SizeBytes <<
"B in addrspace "
781 << AS <<
" with alignment " << Alignment.value()
782 <<
" is misaligned, and therefore can't be vectorized.\n");
786 unsigned ElementwiseSpeed = 0;
787 (
TTI).allowsMisalignedMemoryAccesses((
F).getContext(), VecElemBits, AS,
788 Alignment, &ElementwiseSpeed);
789 if (VectorizedSpeed < ElementwiseSpeed) {
791 <<
"LSV: Access of " << SizeBytes <<
"B in addrspace "
792 << AS <<
" with alignment " << Alignment.value()
793 <<
" has relative speed " << VectorizedSpeed
794 <<
", which is lower than the elementwise speed of "
796 <<
". Therefore this access won't be vectorized.\n");
812 bool IsAllocaAccess = AS ==
DL.getAllocaAddrSpace() &&
815 Align PrefAlign =
Align(StackAdjustedAlignment);
816 if (IsAllocaAccess && Alignment.
value() % SizeBytes != 0 &&
817 IsAllowedAndFast(PrefAlign)) {
819 PtrOperand, PrefAlign,
DL,
C[CBegin].Inst,
nullptr, &DT);
820 if (NewAlign >= Alignment) {
822 <<
"LSV: splitByChain upgrading alloca alignment from "
823 << Alignment.
value() <<
" to " << NewAlign.
value()
825 Alignment = NewAlign;
829 if (!IsAllowedAndFast(Alignment)) {
831 dbgs() <<
"LSV: splitChainByAlignment discarding candidate chain "
832 "because its alignment is not AllowedAndFast: "
833 << Alignment.
value() <<
"\n");
842 dbgs() <<
"LSV: splitChainByAlignment discarding candidate chain "
843 "because !isLegalToVectorizeLoad/StoreChain.");
848 Chain &NewChain =
Ret.emplace_back();
849 for (
unsigned I = CBegin;
I <= CEnd; ++
I)
850 NewChain.emplace_back(
C[
I]);
858bool Vectorizer::vectorizeChain(Chain &
C) {
862 sortChainInOffsetOrder(
C);
865 dbgs() <<
"LSV: Vectorizing chain of " <<
C.size() <<
" instructions:\n";
869 Type *VecElemTy = getChainElemTy(
C);
870 bool IsLoadChain = isa<LoadInst>(
C[0].Inst);
872 unsigned ChainBytes = std::accumulate(
873 C.begin(),
C.end(), 0u, [&](
unsigned Bytes,
const ChainElem &E) {
874 return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
876 assert(ChainBytes %
DL.getTypeStoreSize(VecElemTy) == 0);
880 VecElemTy, 8 * ChainBytes /
DL.getTypeSizeInBits(VecElemTy));
885 if (AS ==
DL.getAllocaAddrSpace()) {
886 Alignment = std::max(
894 for (
const ChainElem &E :
C)
896 DL.getTypeStoreSize(VecElemTy));
903 Builder.SetInsertPoint(
905 return A.Inst->comesBefore(
B.Inst);
910 VecInst = Builder.CreateAlignedLoad(VecTy,
915 for (
const ChainElem &E :
C) {
919 if (
auto *VT = dyn_cast<FixedVectorType>(
T)) {
920 auto Mask = llvm::to_vector<8>(
921 llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
922 V = Builder.CreateShuffleVector(VecInst, Mask,
I->getName());
923 VecIdx += VT->getNumElements();
925 V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
929 if (
V->getType() !=
I->getType())
930 V = Builder.CreateBitOrPointerCast(V,
I->getType());
931 I->replaceAllUsesWith(V);
957 return A.Inst->comesBefore(
B.Inst);
963 auto InsertElem = [&](
Value *
V) {
964 if (
V->getType() != VecElemTy)
965 V = Builder.CreateBitOrPointerCast(V, VecElemTy);
966 Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
968 for (
const ChainElem &E :
C) {
969 auto *
I = cast<StoreInst>(E.Inst);
972 for (
int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
973 InsertElem(Builder.CreateExtractElement(
I->getValueOperand(),
974 Builder.getInt32(J)));
977 InsertElem(
I->getValueOperand());
983 VecInst = Builder.CreateAlignedStore(
991 for (
const ChainElem &E :
C)
992 ToErase.emplace_back(E.Inst);
994 ++NumVectorInstructions;
995 NumScalarsVectorized +=
C.size();
999template <
bool IsLoadChain>
1000bool Vectorizer::isSafeToMove(
1003 LLVM_DEBUG(
dbgs() <<
"LSV: isSafeToMove(" << *ChainElem <<
" -> "
1004 << *ChainBegin <<
")\n");
1006 assert(isa<LoadInst>(ChainElem) == IsLoadChain);
1007 if (ChainElem == ChainBegin)
1012 if (isInvariantLoad(ChainElem))
1015 auto BBIt = std::next([&] {
1016 if constexpr (IsLoadChain)
1021 auto BBItEnd = std::next([&] {
1022 if constexpr (IsLoadChain)
1028 const APInt &ChainElemOffset = ChainOffsets.
at(ChainElem);
1029 const unsigned ChainElemSize =
1032 for (; BBIt != BBItEnd; ++BBIt) {
1035 if (!
I->mayReadOrWriteMemory())
1039 if (IsLoadChain && isa<LoadInst>(
I))
1043 if (!IsLoadChain && isInvariantLoad(
I))
1052 if (
auto OffsetIt = ChainOffsets.
find(
I); OffsetIt != ChainOffsets.
end()) {
1059 const APInt &IOffset = OffsetIt->second;
1061 if (IOffset == ChainElemOffset ||
1062 (IOffset.
sle(ChainElemOffset) &&
1063 (IOffset + IElemSize).sgt(ChainElemOffset)) ||
1064 (ChainElemOffset.
sle(IOffset) &&
1065 (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
1071 dbgs() <<
"LSV: Found alias in chain: " << *
I <<
"\n";
1083 <<
" Aliasing instruction:\n"
1084 <<
" " << *
I <<
'\n'
1085 <<
" Aliased instruction and pointer:\n"
1086 <<
" " << *ChainElem <<
'\n'
1104 unsigned MatchingOpIdxB,
bool Signed) {
1105 LLVM_DEBUG(
dbgs() <<
"LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
1106 <<
", AddOpA=" << *AddOpA <<
", MatchingOpIdxA="
1107 << MatchingOpIdxA <<
", AddOpB=" << *AddOpB
1108 <<
", MatchingOpIdxB=" << MatchingOpIdxB
1109 <<
", Signed=" <<
Signed <<
"\n");
1125 AddOpB->
getOpcode() == Instruction::Add &&
1129 Value *OtherOperandA = AddOpA->
getOperand(MatchingOpIdxA == 1 ? 0 : 1);
1130 Value *OtherOperandB = AddOpB->
getOperand(MatchingOpIdxB == 1 ? 0 : 1);
1131 Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
1132 Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
1134 if (OtherInstrB && OtherInstrB->
getOpcode() == Instruction::Add &&
1136 isa<ConstantInt>(OtherInstrB->
getOperand(1))) {
1138 cast<ConstantInt>(OtherInstrB->
getOperand(1))->getSExtValue();
1139 if (OtherInstrB->
getOperand(0) == OtherOperandA &&
1144 if (OtherInstrA && OtherInstrA->
getOpcode() == Instruction::Add &&
1146 isa<ConstantInt>(OtherInstrA->
getOperand(1))) {
1148 cast<ConstantInt>(OtherInstrA->
getOperand(1))->getSExtValue();
1149 if (OtherInstrA->
getOperand(0) == OtherOperandB &&
1155 if (OtherInstrA && OtherInstrB &&
1156 OtherInstrA->
getOpcode() == Instruction::Add &&
1157 OtherInstrB->
getOpcode() == Instruction::Add &&
1160 isa<ConstantInt>(OtherInstrA->
getOperand(1)) &&
1161 isa<ConstantInt>(OtherInstrB->
getOperand(1))) {
1163 cast<ConstantInt>(OtherInstrA->
getOperand(1))->getSExtValue();
1165 cast<ConstantInt>(OtherInstrB->
getOperand(1))->getSExtValue();
1174std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
1176 LLVM_DEBUG(
dbgs() <<
"LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
1177 <<
" PtrB=" << *PtrB <<
" ContextInst=" << *ContextInst
1178 <<
" Depth=" <<
Depth <<
"\n");
1179 auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
1180 auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
1182 return getConstantOffsetSelects(PtrA, PtrB, ContextInst,
Depth);
1186 if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
1187 GEPA->getPointerOperand() != GEPB->getPointerOperand())
1188 return std::nullopt;
1191 for (
unsigned I = 0, E = GEPA->getNumIndices() - 1;
I < E; ++
I) {
1193 return std::nullopt;
1202 return std::nullopt;
1207 if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
1208 return std::nullopt;
1210 bool Signed = isa<SExtInst>(OpA);
1214 OpB = dyn_cast<Instruction>(OpB->
getOperand(0));
1216 return std::nullopt;
1222 return std::nullopt;
1226 return std::nullopt;
1229 LLVM_DEBUG(
dbgs() <<
"LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
1237 if (OpB->
getOpcode() == Instruction::Add &&
1239 IdxDiff.
sle(cast<ConstantInt>(OpB->
getOperand(1))->getSExtValue()) &&
1245 OpA = dyn_cast<Instruction>(ValA);
1246 if (!Safe && OpA && OpA->
getOpcode() == Instruction::Add &&
1252 for (
unsigned MatchingOpIdxA : {0, 1})
1253 for (
unsigned MatchingOpIdxB : {0, 1})
1277 if (BitsAllowedToBeSet.
ult(IdxDiff.
abs()))
1278 return std::nullopt;
1283 return IdxDiff * Stride;
1284 return std::nullopt;
1287std::optional<APInt> Vectorizer::getConstantOffsetSelects(
1290 return std::nullopt;
1292 if (
auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
1293 if (
auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
1294 if (SelectA->getCondition() != SelectB->getCondition())
1295 return std::nullopt;
1296 LLVM_DEBUG(
dbgs() <<
"LSV: getConstantOffsetSelects, PtrA=" << *PtrA
1297 <<
", PtrB=" << *PtrB <<
", ContextInst="
1298 << *ContextInst <<
", Depth=" <<
Depth <<
"\n");
1299 std::optional<APInt> TrueDiff = getConstantOffset(
1300 SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst,
Depth);
1302 return std::nullopt;
1303 std::optional<APInt> FalseDiff =
1304 getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
1305 ContextInst,
Depth);
1306 if (TrueDiff == FalseDiff)
1310 return std::nullopt;
1313void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses)
const {
1314 if (EQClasses.size() < 2)
1319 static_assert(std::tuple_size_v<EqClassKey> == 4,
1320 "EqClassKey has changed - EqClassReducedKey needs changes too");
1321 using EqClassReducedKey =
1322 std::tuple<std::tuple_element_t<1, EqClassKey> ,
1323 std::tuple_element_t<2, EqClassKey> ,
1324 std::tuple_element_t<3, EqClassKey> >;
1325 using ECReducedKeyToUnderlyingObjectMap =
1332 ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap;
1333 bool FoundPotentiallyOptimizableEC =
false;
1334 for (
const auto &EC : EQClasses) {
1335 const auto &
Key =
EC.first;
1336 EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
1338 RedKeyToUOMap[RedKey].
insert(std::get<0>(Key));
1339 if (RedKeyToUOMap[RedKey].
size() > 1)
1340 FoundPotentiallyOptimizableEC =
true;
1342 if (!FoundPotentiallyOptimizableEC)
1346 dbgs() <<
"LSV: mergeEquivalenceClasses: before merging:\n";
1347 for (
const auto &EC : EQClasses) {
1348 dbgs() <<
" Key: {" <<
EC.first <<
"}\n";
1349 for (
const auto &Inst :
EC.second)
1350 dbgs() <<
" Inst: " << *Inst <<
'\n';
1354 dbgs() <<
"LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
1355 for (
const auto &RedKeyToUO : RedKeyToUOMap) {
1356 dbgs() <<
" Reduced key: {" << std::get<0>(RedKeyToUO.first) <<
", "
1357 << std::get<1>(RedKeyToUO.first) <<
", "
1358 <<
static_cast<int>(std::get<2>(RedKeyToUO.first)) <<
"} --> "
1359 << RedKeyToUO.second.size() <<
" underlying objects:\n";
1360 for (
auto UObject : RedKeyToUO.second)
1361 dbgs() <<
" " << *UObject <<
'\n';
1368 auto GetUltimateTargets =
1370 UObjectToUObjectMap IndirectionMap;
1371 for (
const auto *UObject : UObjects) {
1372 const unsigned MaxLookupDepth = 1;
1374 if (UltimateTarget != UObject)
1375 IndirectionMap[UObject] = UltimateTarget;
1377 UObjectToUObjectMap UltimateTargetsMap;
1378 for (
const auto *UObject : UObjects) {
1380 auto It = IndirectionMap.find(
Target);
1381 for (; It != IndirectionMap.end(); It = IndirectionMap.find(
Target))
1383 UltimateTargetsMap[UObject] =
Target;
1385 return UltimateTargetsMap;
1390 for (
auto &[RedKey, UObjects] : RedKeyToUOMap) {
1391 if (UObjects.size() < 2)
1393 auto UTMap = GetUltimateTargets(UObjects);
1394 for (
const auto &[UObject, UltimateTarget] : UTMap) {
1395 if (UObject == UltimateTarget)
1398 EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
1399 std::get<2>(RedKey)};
1400 EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
1401 std::get<2>(RedKey)};
1404 const auto &VecTo = EQClasses[KeyTo];
1405 const auto &VecFrom = EQClasses[KeyFrom];
1407 std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(),
1408 std::back_inserter(MergedVec),
1410 return A && B && A->comesBefore(B);
1412 EQClasses[KeyTo] = std::move(MergedVec);
1413 EQClasses.erase(KeyFrom);
1417 dbgs() <<
"LSV: mergeEquivalenceClasses: after merging:\n";
1418 for (
const auto &EC : EQClasses) {
1419 dbgs() <<
" Key: {" <<
EC.first <<
"}\n";
1420 for (
const auto &Inst :
EC.second)
1421 dbgs() <<
" Inst: " << *Inst <<
'\n';
1429 EquivalenceClassMap
Ret;
1431 auto GetUnderlyingObject = [](
const Value *
Ptr) ->
const Value * {
1433 if (
const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
1440 return Sel->getCondition();
1446 auto *LI = dyn_cast<LoadInst>(&
I);
1447 auto *
SI = dyn_cast<StoreInst>(&
I);
1451 if ((LI && !LI->
isSimple()) || (SI && !
SI->isSimple()))
1464 unsigned TySize =
DL.getTypeSizeInBits(Ty);
1465 if ((TySize % 8) != 0)
1476 unsigned AS =
Ptr->getType()->getPointerAddressSpace();
1479 unsigned VF = VecRegSize / TySize;
1480 VectorType *VecTy = dyn_cast<VectorType>(Ty);
1484 (VecTy && !
isPowerOf2_32(
DL.getTypeSizeInBits(VecTy->getScalarType()))))
1488 if (TySize > VecRegSize / 2 ||
1492 Ret[{GetUnderlyingObject(
Ptr), AS,
1498 mergeEquivalenceClasses(Ret);
1507 unsigned ASPtrBits =
DL.getIndexSizeInBits(AS);
1511 for (
size_t I = 1;
I < Instrs.
size(); ++
I) {
1512 assert(Instrs[
I - 1]->comesBefore(Instrs[
I]));
1521 struct InstrListElem :
ilist_node<InstrListElem>,
1522 std::pair<Instruction *, Chain> {
1526 struct InstrListElemDenseMapInfo {
1529 static InstrListElem *getEmptyKey() {
return PtrInfo::getEmptyKey(); }
1530 static InstrListElem *getTombstoneKey() {
1531 return PtrInfo::getTombstoneKey();
1533 static unsigned getHashValue(
const InstrListElem *
E) {
1534 return IInfo::getHashValue(
E->first);
1536 static bool isEqual(
const InstrListElem *
A,
const InstrListElem *
B) {
1537 if (
A == getEmptyKey() ||
B == getEmptyKey())
1538 return A == getEmptyKey() &&
B == getEmptyKey();
1539 if (
A == getTombstoneKey() ||
B == getTombstoneKey())
1540 return A == getTombstoneKey() &&
B == getTombstoneKey();
1541 return IInfo::isEqual(
A->first,
B->first);
1552 constexpr int MaxChainsToTry = 64;
1554 bool MatchFound =
false;
1555 auto ChainIter = MRU.
begin();
1556 for (
size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.
end();
1558 if (std::optional<APInt>
Offset = getConstantOffset(
1562 (ChainIter->first->comesBefore(
I) ?
I : ChainIter->first))) {
1565 ChainIter->second.emplace_back(
I,
Offset.value());
1575 APInt ZeroOffset(ASPtrBits, 0);
1576 InstrListElem *E =
new (
Allocator.Allocate()) InstrListElem(
I);
1577 E->second.emplace_back(
I, ZeroOffset);
1583 std::vector<Chain>
Ret;
1587 if (E.second.size() > 1)
1588 Ret.emplace_back(std::move(E.second));
1592std::optional<APInt> Vectorizer::getConstantOffset(
Value *PtrA,
Value *PtrB,
1596 <<
", PtrB=" << *PtrB <<
", ContextInst= " << *ContextInst
1597 <<
", Depth=" <<
Depth <<
"\n");
1600 unsigned OrigBitWidth =
DL.getIndexTypeSizeInBits(PtrA->
getType());
1601 APInt OffsetA(OrigBitWidth, 0);
1602 APInt OffsetB(OrigBitWidth, 0);
1605 unsigned NewPtrBitWidth =
DL.getTypeStoreSizeInBits(PtrA->
getType());
1606 if (NewPtrBitWidth !=
DL.getTypeStoreSizeInBits(PtrB->
getType()))
1607 return std::nullopt;
1612 assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
1613 OffsetB.getSignificantBits() <= NewPtrBitWidth);
1615 OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
1616 OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
1618 return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);
1623 LLVM_DEBUG(
dbgs() <<
"LSV: SCEV PtrB - PtrA =" << *DistScev <<
"\n");
1629 return (OffsetB - OffsetA + Dist).
sextOrTrunc(OrigBitWidth);
1632 if (std::optional<APInt> Diff =
1633 getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst,
Depth))
1634 return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
1636 return std::nullopt;
AMDGPU Mark last scratch load
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Expand Atomic instructions
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
This file defines the DenseMap class.
Module.h This file contains the declarations for the Module class.
static const unsigned MaxDepth
static bool checkNoWrapFlags(Instruction *I, bool Signed)
static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA, unsigned MatchingOpIdxA, Instruction *AddOpB, unsigned MatchingOpIdxB, bool Signed)
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static bool isSafeToMove(const MachineInstr &From, const MachineInstr &To)
Check if it's safe to move From down to To, checking that no physical registers are clobbered.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Provides some synthesis utilities to produce sequences of values.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
A manager for alias analyses.
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Check whether or not an instruction may read or write the optionally specified memory location.
Class for arbitrary precision integers.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
APInt zext(unsigned width) const
Zero extend to a new width.
APInt abs() const
Get the absolute value.
bool sgt(const APInt &RHS) const
Signed greater than comparison.
unsigned getBitWidth() const
Return the number of bits in the APInt.
bool ult(const APInt &RHS) const
Unsigned less than comparison.
bool sle(const APInt &RHS) const
Signed less or equal comparison.
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
int64_t getSExtValue() const
Get sign extended value.
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesCFG()
This function should be called by the pass, iff they do not:
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
A function analysis which provides an AssumptionCache.
AssumptionCache run(Function &F, FunctionAnalysisManager &)
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
InstListType::reverse_iterator reverse_iterator
InstListType::iterator iterator
Instruction iterators...
Represents analyses that only rely on functions' control flow.
This class represents a range of values.
const APInt * getSingleElement() const
If this set contains a single element, return it, otherwise return null.
bool isSingleElement() const
Return true if this set contains exactly one member.
A parsed version of the target data layout string in and methods for querying it.
iterator find(const_arg_type_t< KeyT > Val)
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Implements a dense probed hash-table based set.
Analysis pass which computes a DominatorTree.
Legacy analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Class to represent fixed width SIMD vectors.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
FunctionPass class - This class is used to implement most global optimizations.
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.
Legacy wrapper pass to provide the GlobalsAAResult object.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instruction comes before Other.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
An instruction for reading from memory.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
This class implements a map that also provides access to all stored values in a deterministic order.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
Legacy wrapper pass to provide the SCEVAAResult object.
This class represents an analyzed expression in the program.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
ConstantRange getSignedRange(const SCEV *S)
Determine the signed range for a particular SCEV.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getCouldNotCompute()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instances.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
StringRef - Represent a constant reference to a string, i.e.
Analysis pass providing the TargetTransformInfo.
Target - Wrapper for Target specific information.
The instances of the Type class are immutable: once they are created, they are never changed.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isPointerTy() const
True if this is an instance of PointerType.
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to false.
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
std::pair< iterator, bool > insert(const ValueT &V)
TypeSize getSequentialElementStride(const DataLayout &DL) const
Value * getOperand() const
const ParentTy * getParent() const
This class implements an extremely fast bulk output stream that can only output to a stream.
A simple intrusive list implementation.
void push_front(reference Node)
Insert a node at the front; never copies.
void remove(reference N)
Remove a node by reference; never deletes.
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Pass * createLoadStoreVectorizerPass()
Create a legacy pass manager instance of the LoadStoreVectorizer pass.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< po_iterator< T > > post_order(const T &G)
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Align getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to ensure that the alignment of V is at least PrefAlign bytes.
bool isModSet(const ModRefInfo MRI)
void sort(IteratorTy Start, IteratorTy End)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isModOrRefSet(const ModRefInfo MRI)
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bitsets.
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
constexpr unsigned BitWidth
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successor blocks.
gep_type_iterator gep_type_begin(const User *GEP)
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &)
Implement std::hash so that hash_code can be used in STL containers.
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
An information struct used to provide DenseMap with the various necessary components for a given valu...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.