#ifdef EXPENSIVE_CHECKS

using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

    cl::desc("Run the SLP vectorization passes"));

    cl::desc("Only vectorize if you gain more than this "

    cl::desc("Attempt to vectorize horizontal reductions"));

        "Attempt to vectorize horizontal reductions feeding into a store"));

    cl::desc("Allow optimization of original scalar identity operations on "
             "matched horizontal reductions."));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

    cl::desc("Limit the size of the SLP scheduling region per block"));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Limit the recursion depth when building a vectorizable tree"));

    cl::desc("Only vectorize small trees if they are fully vectorizable"));

    cl::desc("The maximum look-ahead depth for operand reordering scores"));

    cl::desc("The maximum look-ahead depth for searching best rooting option"));

    cl::desc("Display the SLP trees with Graphviz"));
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&

  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);

  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
  if (isa<ExtractElementInst>(I))
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");

  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);

  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative();

  if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();

  const auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
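// Worked example (illustrative, not from the source): for
// `insertvalue {[2 x i32], i32} %agg, i32 %v, 0, 1` the loop above first
// scales Index by the struct's two members and adds index 0, then scales by
// the array's two elements and adds 1, producing flattened index 1 into the
// aggregate laid out as {a[0], a[1], b}.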
      if (MaskArg == UseMask::UndefsAsMask)
      if (MaskArg == UseMask::FirstArg && Value < VF)
        UseMask.reset(Value);
      else if (MaskArg == UseMask::SecondArg && Value >= VF)
        UseMask.reset(Value - VF);

template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  auto *C = dyn_cast<Constant>(V);
  if (!UseMask.empty()) {
    while (auto *II = dyn_cast<InsertElementInst>(Base)) {
      Base = II->getOperand(0);
      if (isa<T>(II->getOperand(1)))
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))
    Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
        (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
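// Contract sketch for isUndefVector (wording ours): it reports whether every
// element of V selected by UseMask is undef (or strictly poison when
// IsPoisonOnly is set). Walking the insertelement chain above lets a
// partially built vector still qualify when the only well-defined lanes are
// exactly the ones UseMask excludes.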
static std::optional<TargetTransformInfo::ShuffleKind>
      find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
  auto *EI0 = cast<ExtractElementInst>(*It);
  if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
      cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    if (isa<UndefValue>(VL[I]))
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
    auto *Vec = EI->getVectorOperand();
    if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
    if (isa<UndefValue>(EI->getIndexOperand()))
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)

  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    return CI->getZExtValue();
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
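// Illustrative classification for the shuffle detection above: out-of-order
// extracts from a single source end up as a single-source permute,
// lane-preserving picks from two same-sized sources qualify as a select, and
// anything else degrades CommonShuffleMode to Permute (a two-source permute).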
static std::optional<TTI::ShuffleKind>
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (isa<UndefValue>(VL[I]))
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  for (const auto &Data : VectorOpToIdx)
    VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
        .push_back(Data.first);
  for (auto &Data : VFToVector) {
      return VectorOpToIdx.find(V1)->second.size() >
             VectorOpToIdx.find(V2)->second.size();
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  Value *SingleVec = nullptr;
  unsigned PairMax = 0;
  std::pair<Value *, Value *> PairVec(nullptr, nullptr);
  for (auto &Data : VFToVector) {
    if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
      SingleMax = VectorOpToIdx[V1].size() + UndefSz;
    if (Data.second.size() > 1)
      V2 = *std::next(Data.second.begin());
    if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
      PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
      PairVec = std::make_pair(V1, V2);
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : VectorOpToIdx[SingleVec])
    for (Value *V : {PairVec.first, PairVec.second})
      for (int Idx : VectorOpToIdx[V])
  for (int Idx : UndefVectorExtracts)
  std::optional<TTI::ShuffleKind> Res =
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
struct InstructionsState {
  Value *OpValue = nullptr;

  unsigned getAltOpcode() const {

  bool isAltShuffle() const { return AltOp != MainOp; }

    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;

  InstructionsState() = delete;
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}

  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))

                                       unsigned BaseIndex = 0);

          (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
           !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
          BaseOp0 == Op0 || BaseOp1 == Op1 ||
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&
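// Illustrative example of the check above: `icmp slt %a, %b` is compatible
// with `icmp sgt %c, %d` because the second predicate is the swapped form of
// the first, so commuting one lane's operands lets both lanes share a single
// vector compare.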
                                       unsigned BaseIndex) {
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  auto *IBase = cast<Instruction>(VL[BaseIndex]);
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = cast<Instruction>(VL[Cnt]);
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        AltOpcode = InstOpcode;
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex != BaseIndex) {
        } else if (BasePred != CurrentPred) {
                 "CmpInst isn't safe for alternation, logic needs to be updated!");
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
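// Illustrative example of the result: the bundle { add i32 %a, %b ;
// sub i32 %c, %d } yields MainOp = add and AltOp = sub, i.e. an alternate
// shuffle that is later emitted as one vector add, one vector sub and a
// shufflevector blending the lanes.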
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
             Arg.value().get() == Scalar;

  if (LoadInst *LI = dyn_cast<LoadInst>(I))

  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
    return SI->isSimple();
    return !MI->isVolatile();
                    bool ExtendingManyInputs = false) {
         (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
          (SubMask.size() == Mask.size() &&
           std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
                       [](int Idx) { return Idx == PoisonMaskElem; }))) &&
         "SubMask with many inputs support must be larger than the mask.");
    Mask.append(SubMask.begin(), SubMask.end());
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
    NewMask[I] = Mask[SubMask[I]];
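// Worked example of the composition above (illustrative): Mask = {3,2,0,1}
// and SubMask = {1,0,PoisonMaskElem,3} give NewMask[I] = Mask[SubMask[I]] =
// {2,3,poison,1}, so applying SubMask to an already-masked vector is the
// same as applying the composed mask once.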
  const unsigned Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");

  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
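// Worked example (illustrative): Indices = {2,0,1} ("take element 2 first,
// then 0, then 1") produces Mask = {1,2,0}, since Mask[Indices[I]] = I
// builds the inverse permutation.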
  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Scalars[Mask[I]] = Prev[I];
  auto *I = dyn_cast<Instruction>(V);
    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();

  auto *I = dyn_cast<Instruction>(V);
  constexpr int UsesLimit = 8;
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
        auto *IU = dyn_cast<Instruction>(U);
        return IU->getParent() != I->getParent() || isa<PHINode>(IU);

  return !VL.empty() &&
namespace slpvectorizer {

  struct ScheduleData;

      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
        DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {

    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();

    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;

    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MultiNodeScalars.clear();
    EntryToLastInstruction.clear();
    ExternalUses.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();

    return MaxVecRegSize;

    return MinVecRegSize;

    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
    return MaxVF ? MaxVF : UINT_MAX;

      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
      : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
        MaxLevel(MaxLevel) {}

    if (isa<LoadInst>(V1)) {
      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        static constexpr unsigned Limit = 8;
        if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))
        auto AllUsersVectorized = [U1, U2, this](Value *V) {
            return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))

    auto *LI1 = dyn_cast<LoadInst>(V1);
    auto *LI2 = dyn_cast<LoadInst>(V2);
      if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
          LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
          LI2->getPointerOperand(), DL, SE, true);
      if (!Dist || *Dist == 0) {
            R.TTI->isLegalMaskedGather(
      if (std::abs(*Dist) > NumLanes / 2)

    auto *C1 = dyn_cast<Constant>(V1);
    auto *C2 = dyn_cast<Constant>(V2);

      if (isa<UndefValue>(V2))
      Value *EV2 = nullptr;
      int Dist = Idx2 - Idx1;
      if (std::abs(Dist) == 0)
      if (std::abs(Dist) > NumLanes / 2)

    auto *I1 = dyn_cast<Instruction>(V1);
    auto *I2 = dyn_cast<Instruction>(V2);
      if (I1->getParent() != I2->getParent())
      if (S.getOpcode() &&
          (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
           !S.isAltShuffle()) &&
            return cast<Instruction>(V)->getNumOperands() ==
                   S.MainOp->getNumOperands();

    if (isa<UndefValue>(V2))

    int ShallowScoreAtThisLevel =
    auto *I1 = dyn_cast<Instruction>(LHS);
    auto *I2 = dyn_cast<Instruction>(RHS);
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
          (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
          (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
         ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");

    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
              ? I2->getNumOperands()
              : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        if (Op2Used.count(OpIdx2))
            I1, I2, CurrLevel + 1, std::nullopt);
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
    return ShallowScoreAtThisLevel;
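// Sketch of the recursion above: the pair (LHS, RHS) earns its shallow score
// plus the best achievable pairwise scores of its operands, explored up to
// MaxLevel levels deep. For example, two adds fed by consecutive loads in the
// same operand position score higher than adds whose loads only match after
// crossing operands, which steers operand reordering toward contiguous memory.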
  struct OperandData {
    OperandData() = default;
    OperandData(Value *V, bool APO, bool IsUsed)
        : V(V), APO(APO), IsUsed(IsUsed) {}

    bool IsUsed = false;

  enum class ReorderingMode {

  OperandData &getData(unsigned OpIdx, unsigned Lane) {
    return OpsVec[OpIdx][Lane];

  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
    return OpsVec[OpIdx][Lane];

    for (unsigned OpIdx = 0, NumOperands = getNumOperands();
         OpIdx != NumOperands; ++OpIdx)
      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
        OpsVec[OpIdx][Lane].IsUsed = false;

  void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
    std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);

  int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
    for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
      Value *OpIdxLnV = getData(OpIdx, Ln).V;
      if (!isa<Instruction>(OpIdxLnV))
      Uniques.insert(OpIdxLnV);
    int UniquesCount = Uniques.size();
    int UniquesCntWithIdxLaneV =
        Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
    Value *OpIdxLaneV = getData(OpIdx, Lane).V;
    int UniquesCntWithOpIdxLaneV =
        Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
    if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
               UniquesCntWithOpIdxLaneV) -
           (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);

  int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    Value *OpIdxLaneV = getData(OpIdx, Lane).V;
    auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
    if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
    return R.areAllUsersVectorized(IdxLaneI)

  static const int ScoreScaleFactor = 10;

                        int Lane, unsigned OpIdx, unsigned Idx,
      int SplatScore = getSplatScore(Lane, OpIdx, Idx);
      if (Score <= -SplatScore) {
        Score += SplatScore;
        Score *= ScoreScaleFactor;
        Score += getExternalUseScore(Lane, OpIdx, Idx);

  std::optional<unsigned>
  getBestOperand(unsigned OpIdx, int Lane, int LastLane,
    unsigned NumOperands = getNumOperands();
    Value *OpLastLane = getData(OpIdx, LastLane).V;
    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;

    bool OpIdxAPO = getData(OpIdx, Lane).APO;
      std::optional<unsigned> Idx;
        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
        RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
      OperandData &OpData = getData(Idx, Lane);
      bool OpAPO = OpData.APO;
      if (OpAPO != OpIdxAPO)
      case ReorderingMode::Load:
      case ReorderingMode::Constant:
      case ReorderingMode::Opcode: {
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed);
        if (Score > static_cast<int>(BestOp.Score)) {
          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
      case ReorderingMode::Splat:
        if (Op == OpLastLane)
      case ReorderingMode::Failed:
      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
    return std::nullopt;

  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;
    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);
      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        auto It = HashMap.find(NumFreeOpsHash.Hash);
        if (It == HashMap.end())
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;
      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;

  struct OperandsOrderData {
    unsigned NumOfAPOs = UINT_MAX;
    unsigned NumOpsWithSameOpcodeParent = 0;

  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
    unsigned CntTrue = 0;
    unsigned NumOperands = getNumOperands();
    bool AllUndefs = true;
    unsigned NumOpsWithSameOpcodeParent = 0;
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      const OperandData &OpData = getData(OpIdx, Lane);
      if (auto *I = dyn_cast<Instruction>(OpData.V)) {
            I->getParent() != Parent) {
          if (NumOpsWithSameOpcodeParent == 0) {
            NumOpsWithSameOpcodeParent = 1;
          Parent = I->getParent();
          --NumOpsWithSameOpcodeParent;
        ++NumOpsWithSameOpcodeParent;
          Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
      AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
    OperandsOrderData Data;
    Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
    Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;

    assert((empty() || VL.size() == getNumLanes()) &&
           "Expected same number of lanes");
    assert(isa<Instruction>(VL[0]) && "Expected instruction");
    unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
    OpsVec.resize(NumOperands);
    unsigned NumLanes = VL.size();
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      OpsVec[OpIdx].resize(NumLanes);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
        bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
        bool APO = (OpIdx == 0) ? false : IsInverseOperation;
        OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),

  unsigned getNumOperands() const { return OpsVec.size(); }

  unsigned getNumLanes() const { return OpsVec[0].size(); }

  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;

  bool empty() const { return OpsVec.empty(); }

  void clear() { OpsVec.clear(); }

  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
    bool OpAPO = getData(OpIdx, Lane).APO;
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
      bool FoundCandidate = false;
      for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
        OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)
        FoundCandidate = true;
      if (!FoundCandidate)

      : TLI(TLI), DL(DL), SE(SE), R(R) {
    appendOperandsOfVL(RootVL);

    assert(OpsVec[OpIdx].size() == getNumLanes() &&
           "Expected same num of lanes across all operands");
    for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
      OpVL[Lane] = OpsVec[OpIdx][Lane].V;
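// Usage sketch (illustrative): for the bundle { %x = add %a, %b ;
// %y = add %d, %c }, OpsVec holds operand 0 = {%a, %d} and operand 1 =
// {%b, %c} across the two lanes; since add is commutative, reordering may
// swap %d and %c within lane 1 so that matching values line up in the same
// operand column.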
    unsigned NumOperands = getNumOperands();
    unsigned NumLanes = getNumLanes();
    unsigned FirstLane = getBestLaneToStartReordering();
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      Value *OpLane0 = getValue(OpIdx, FirstLane);
      if (isa<LoadInst>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Load;
      else if (isa<Instruction>(OpLane0)) {
        if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
          ReorderingModes[OpIdx] = ReorderingMode::Opcode;
      } else if (isa<Constant>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Constant;
      else if (isa<Argument>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Splat;
        ReorderingModes[OpIdx] = ReorderingMode::Failed;

    auto &&SkipReordering = [this]() {
      for (const OperandData &Data : Op0)
        if (any_of(Op, [&UniqueValues](const OperandData &Data) {

    if (SkipReordering())

    bool StrategyFailed = false;
    for (unsigned I = 0; I < NumOperands; ++I)
      MainAltOps[I].push_back(getData(I, FirstLane).V);
    for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
      int Lane = FirstLane + Direction * Distance;
      if (Lane < 0 || Lane >= (int)NumLanes)
      assert(LastLane >= 0 && LastLane < (int)NumLanes &&
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        std::optional<unsigned> BestIdx = getBestOperand(
            OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
          swap(OpIdx, *BestIdx, Lane);
          ReorderingModes[OpIdx] = ReorderingMode::Failed;
          StrategyFailed = true;
        if (MainAltOps[OpIdx].size() != 2) {
          OperandData &AltOp = getData(OpIdx, Lane);
          InstructionsState OpS =
          if (OpS.getOpcode() && OpS.isAltShuffle())
    if (!StrategyFailed)
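// The strategy above is greedy per lane: starting from the lane with the
// fewest alternatives, each following lane picks the operand whose
// look-ahead score best matches the value chosen for the previous lane,
// with the per-operand ReorderingMode (Load/Opcode/Constant/Splat) biasing
// the choice; a failure here is only a missed optimization, never a
// correctness issue.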
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    case ReorderingMode::Load:
    case ReorderingMode::Opcode:
    case ReorderingMode::Constant:
    case ReorderingMode::Splat:
    case ReorderingMode::Failed:

    const unsigned Indent = 2;
      OS << "Operand " << Cnt++ << "\n";
      for (const OperandData &OpData : OpDataVec) {
        if (Value *V = OpData.V)
        OS << ", APO:" << OpData.APO << "}\n";
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
          Candidates[I].second,
      if (Score > BestScore) {

    DeletedInstructions.insert(I);

    return AnalyzedReductionsRoots.count(I);

    AnalyzedReductionsRoots.insert(I);

    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
  canReorderOperands(TreeEntry *UserTE,

  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    TreeEntry *TE = nullptr;
      TE = getTreeEntry(V);
      if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
      auto It = MultiNodeScalars.find(V);
      if (It != MultiNodeScalars.end()) {
        for (TreeEntry *E : It->second) {
          if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
    if (It != VL.end()) {
      assert(TE->isSame(VL) && "Expected same scalars.");

  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);

  bool areAllUsersVectorized(

                      const EdgeInfo &EI);

                     bool ResizeAllowed = false) const;

  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);

  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Args &...Params);

  Value *createBuildVector(const TreeEntry *E);

  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  std::optional<TargetTransformInfo::ShuffleKind>

  void setInsertPointAfterBundle(const TreeEntry *E);

  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  static void reorderInputsAccordingToOpcode(

  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

          [Scalars](Value *V, int Idx) {
            return (isa<UndefValue>(V) &&
                    Idx == PoisonMaskElem) ||
                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          return IsSame(Scalars, Mask);
      return IsSame(Scalars, ReuseShuffleIndices);

    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return State == TreeEntry::NeedToGather &&
             UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;

    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
        if (PrevCount == Used.count())

    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();

      PossibleStridedVectorize,

    VecTreeTy &Container;

      assert(Operands[OpIdx].empty() && "Already resized?");
             "Number of operands is greater than the number of scalars.");

    void setOperandsInOrder() {
      auto *I0 = cast<Instruction>(Scalars[0]);
      Operands.resize(I0->getNumOperands());
      unsigned NumLanes = Scalars.size();
      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
           OpIdx != NumOperands; ++OpIdx) {
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          auto *I = cast<Instruction>(Scalars[Lane]);
          assert(I->getNumOperands() == NumOperands &&
                 "Expected same number of operands");
          Operands[OpIdx][Lane] = I->getOperand(OpIdx);

    unsigned getNumOperands() const { return Operands.size(); }

    Value *getSingleOperand(unsigned OpIdx) const {
      assert(!Operands[OpIdx].empty() && "No operand available");

    bool isAltShuffle() const { return MainOp != AltOp; }

      unsigned CheckedOpcode = I->getOpcode();
      return (getOpcode() == CheckedOpcode ||
              getAltOpcode() == CheckedOpcode);

      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))

    void setOperations(const InstructionsState &S) {

    unsigned getOpcode() const {
      return MainOp ? MainOp->getOpcode() : 0;

    unsigned getAltOpcode() const {

    int findLaneForValue(Value *V) const {
      unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReorderIndices.empty())
        FoundLane = ReorderIndices[FoundLane];
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReuseShuffleIndices.empty()) {
        FoundLane = std::distance(ReuseShuffleIndices.begin(),
                                  find(ReuseShuffleIndices, FoundLane));
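// Note (sketch): the two-step translation above first maps the scalar's
// position through ReorderIndices and then through ReuseShuffleIndices, so
// the returned lane is where V lives in the vector as actually emitted, not
// its index in Scalars.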
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
        dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case PossibleStridedVectorize:
        dbgs() << "PossibleStridedVectorize\n";
        dbgs() << "NeedToGather\n";
      dbgs() << "MainOp: ";
        dbgs() << *MainOp << "\n";
      dbgs() << "AltOp: ";
        dbgs() << *AltOp << "\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";

    dbgs() << "SLP: " << Banner << ":\n";
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";

                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                        ReuseShuffleIndices, ReorderIndices);

                          TreeEntry::EntryState EntryState,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
           "Need to vectorize gather entry?");
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
      Last->Scalars.assign(VL.size(), nullptr);
        if (Idx >= VL.size())
          return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    if (Last->State != TreeEntry::NeedToGather) {
      for (Value *V : VL) {
        const TreeEntry *TE = getTreeEntry(V);
               "Scalar already in tree!");
          MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
        ScalarToTreeEntry[V] = Last;
      ScheduleData *BundleMember = *Bundle;
      assert((BundleMember || isa<PHINode>(S.MainOp) ||
             "Bundle and VL out of sync");
      for (Value *V : VL) {
        BundleMember->TE = Last;
        BundleMember = BundleMember->NextInBundle;
      assert(!BundleMember && "Bundle and VL out of sync");
      MustGather.insert(VL.begin(), VL.end());
    if (UserTreeIdx.UserTE)
      Last->UserTreeIndices.push_back(UserTreeIdx);
  TreeEntry::VecTreeTy VectorizableTree;

    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();

  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }

  const TreeEntry *getTreeEntry(Value *V) const {
    return ScalarToTreeEntry.lookup(V);

  TreeEntry::EntryState getScalarsVectorizationState(

  using ValueToGatherNodesMap =
  ValueToGatherNodesMap ValueToGatherNodes;

  struct ExternalUser {

    AliasCacheKey key = std::make_pair(Inst1, Inst2);
    std::optional<bool> &result = AliasCache[key];
    bool aliased = true;

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  UserList ExternalUses;
  struct ScheduleData {
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

    void init(int BlockSchedulingRegionID, Value *OpVal) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();

      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
        assert(UnscheduledDeps == Dependencies && "invariant");

        assert(isSchedulingEntity() &&
               "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");

      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");

    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    bool isSchedulingEntity() const { return FirstInBundle == this; }

    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;

    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;

    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();

    void resetUnscheduledDeps() {
      UnscheduledDeps = Dependencies;

    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();

    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
        Sum += BundleMember->UnscheduledDeps;

      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        ScheduleData *SD = NextInBundle;
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;

    Value *OpValue = nullptr;

    TreeEntry *TE = nullptr;

    ScheduleData *FirstInBundle = nullptr;

    ScheduleData *NextInBundle = nullptr;

    ScheduleData *NextLoadStore = nullptr;

    int SchedulingRegionID = 0;

    int SchedulingPriority = 0;

    int Dependencies = InvalidDeps;

    int UnscheduledDeps = InvalidDeps;

    bool IsScheduled = false;
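// Dependency bookkeeping in brief (sketch): Dependencies counts all def-use,
// memory and control edges of this member, UnscheduledDeps counts the ones
// whose source has not been scheduled yet, and the bundle as a whole becomes
// ready when the sum of UnscheduledDeps over its members reaches zero.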
                       const BoUpSLP::ScheduleData &SD) {

  struct BlockScheduling {
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      ScheduleRegionSize = 0;

      ++SchedulingRegionID;

      if (BB != I->getParent())
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);

    ScheduleData *getScheduleData(Value *V, Value *Key) {
        return getScheduleData(V);
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end()) {
        ScheduleData *SD = I->second.lookup(Key);
        if (SD && isInSchedulingRegion(SD))

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;

    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;

      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->Inst != BundleMember->OpValue)

        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
            if (OpDef && OpDef->hasValidDependencies() &&
                OpDef->incrementUnscheduledDeps(-1) == 0) {
              ScheduleData *DepBundle = OpDef->FirstInBundle;
              assert(!DepBundle->IsScheduled &&
                     "already scheduled bundle gets ready");
              ReadyList.insert(DepBundle);
                         << "SLP: gets ready (def): " << *DepBundle << "\n");

        if (TreeEntry *TE = BundleMember->TE) {
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, BundleMember->Inst));
          assert(Lane >= 0 && "Lane not set");

          auto *In = BundleMember->Inst;
                 (isa<ExtractValueInst, ExtractElementInst>(In) ||
                  In->getNumOperands() == TE->getNumOperands()) &&
                 "Missed TreeEntry operands?");

          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))

          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))

        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
                       << "SLP: gets ready (mem): " << *DepBundle << "\n");

        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
                       << "SLP: gets ready (ctl): " << *DepBundle << "\n");
      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
        doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });

      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");

    void doForAllOpcodes(Value *V,
      if (ScheduleData *SD = getScheduleData(V))
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end())
        for (auto &P : I->second)
          if (isInSchedulingRegion(P.second))

    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        doForAllOpcodes(I, [&](ScheduleData *SD) {
          if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
            ReadyList.insert(SD);
                       << "SLP: initially in ready list: " << *SD << "\n");

    std::optional<ScheduleData *>
                      const InstructionsState &S);

    ScheduleData *allocateScheduleDataChunks();

    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

                                 ScheduleData *PrevLoadStore,
                                 ScheduleData *NextLoadStore);

    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,

    void resetSchedule();

    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

        ExtraScheduleDataMap;

    ScheduleData *FirstLoadStoreInRegion = nullptr;

    ScheduleData *LastLoadStoreInRegion = nullptr;

    bool RegionHasStackSave = false;

    int ScheduleRegionSize = 0;

    int SchedulingRegionID = 1;
  void scheduleBlock(BlockScheduling *BS);

  struct OrdersTypeDenseMapInfo {
    static unsigned getHashValue(const OrdersType &V) {

  unsigned MaxVecRegSize;
  unsigned MinVecRegSize;

  struct ChildIteratorType
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {

    return R.VectorizableTree[0].get();

    return {N->UserTreeIndices.begin(), N->Container};

    return {N->UserTreeIndices.end(), N->Container};

  class nodes_iterator {
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }

    return nodes_iterator(R->VectorizableTree.begin());

    return nodes_iterator(R->VectorizableTree.end());

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }

    OS << Entry->Idx << ".\n";
    for (auto *V : Entry->Scalars) {
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;

    if (Entry->State == TreeEntry::NeedToGather)
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::PossibleStridedVectorize)
      return "color=blue";
  for (auto *I : DeletedInstructions) {
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
    I->dropAllReferences();
  for (auto *I : DeletedInstructions) {
           "trying to erase instruction with users.");
    I->eraseFromParent();

#ifdef EXPENSIVE_CHECKS

  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Reuses[Mask[I]] = Prev[I];

  assert(!Mask.empty() && "Expected non-empty mask.");
  if (Order.empty()) {
    MaskOrder.resize(Mask.size());
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  Order.assign(Mask.size(), Mask.size());
  for (unsigned I = 0, E = Mask.size(); I < E; ++I)
      Order[MaskOrder[I]] = I;
std::optional<BoUpSLP::OrdersType>
  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
  unsigned NumScalars = TE.Scalars.size();
  OrdersType CurrentOrder(NumScalars, NumScalars);
  const TreeEntry *STE = nullptr;
  for (unsigned I = 0; I < NumScalars; ++I) {
    Value *V = TE.Scalars[I];
    if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
    if (const auto *LocalSTE = getTreeEntry(V)) {
      else if (STE != LocalSTE)
        return std::nullopt;
          std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
      if (Lane >= NumScalars)
        return std::nullopt;
      if (CurrentOrder[Lane] != NumScalars) {
        UsedPositions.reset(CurrentOrder[Lane]);
      CurrentOrder[Lane] = I;
      UsedPositions.set(I);
  if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
      for (unsigned I = 0; I < NumScalars; ++I)
        if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
    if (IsIdentityOrder(CurrentOrder))
    auto *It = CurrentOrder.begin();
    for (unsigned I = 0; I < NumScalars;) {
      if (UsedPositions.test(I)) {
      if (*It == NumScalars) {
    return std::move(CurrentOrder);
  return std::nullopt;
enum class LoadsState {
  PossibleStridedVectorize

                                   bool CompareOpcodes = true) {
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
          getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)

  if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = cast<LoadInst>(V);
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();

  bool IsPossibleStrided = false;
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    std::optional<int> Diff =
      if (static_cast<unsigned>(*Diff) == VL.size() - 1)
        return LoadsState::Vectorize;
      IsPossibleStrided = *Diff % (VL.size() - 1) == 0;

  bool ProfitableGatherPointers =
        return L && L->isLoopInvariant(V);
      })) <= VL.size() / 2 && VL.size() > 2;
  if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
            (GEP && GEP->getNumOperands() == 2);
    Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
          std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
      return IsPossibleStrided ? LoadsState::PossibleStridedVectorize
                               : LoadsState::ScatterVectorize;

  return LoadsState::Gather;
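// Illustrative outcomes of the checks above: loads from p, p+1, p+2, p+3
// give a pointer diff of VL.size() - 1 and classify as Vectorize (one wide
// load); an equal non-unit stride may classify as PossibleStridedVectorize;
// legal but irregular pointers fall back to ScatterVectorize (masked
// gather); anything else gathers.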
      VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
      "Expected list of pointer operands.");
  Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
    std::optional<int> Diff =
        Base.second.emplace_back(Ptr, *Diff, Cnt++);
    if (Bases.size() > VL.size() / 2 - 1)
    Bases[Ptr].emplace_back(Ptr, 0, Cnt++);

  bool AnyConsecutive = false;
  for (auto &Base : Bases) {
    auto &Vec = Base.second;
    if (Vec.size() > 1) {
                  const std::tuple<Value *, int, unsigned> &Y) {
          return std::get<1>(X) < std::get<1>(Y);
      int InitialOffset = std::get<1>(Vec[0]);
            return std::get<1>(P.value()) == int(P.index()) + InitialOffset;

  SortedIndices.clear();
  if (!AnyConsecutive)
  for (auto &Base : Bases) {
    for (auto &T : Base.second)
         "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();
  Ptrs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    return std::move(Order);
  return std::nullopt;

  if (VU->getType() != V->getType())
  if (!VU->hasOneUse() && !V->hasOneUse())
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
  bool IsReusedIdx = false;
    if (IE2 == VU && !IE1)
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
      IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    if (IE2 && IE2 != VU) {
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
      IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
  } while (!IsReusedIdx && (IE1 || IE2));
std::optional<BoUpSLP::OrdersType>
  if (!TE.ReuseShuffleIndices.empty()) {
    unsigned Sz = TE.Scalars.size();
      return std::nullopt;
    unsigned VF = TE.getVectorFactor();
                                  TE.ReuseShuffleIndices.end());
    if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
          return Idx && *Idx < Sz;
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        Value *V = TE.Scalars[ReorderMask[Idx]];
          Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      std::iota(SubMask.begin(), SubMask.end(), 0);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
            [](const auto &Data) { return Data.index() == Data.value(); }))
      return std::nullopt;
    return std::move(ResOrder);
  if ((TE.State == TreeEntry::Vectorize ||
       TE.State == TreeEntry::PossibleStridedVectorize) &&
      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
    return TE.ReorderIndices;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (!V1->hasOneUse() || !V2->hasOneUse())
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
        if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
          if (Idx1 == std::nullopt || Idx2 == std::nullopt)
          return *Idx1 < *Idx2;
      if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
        if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
          if (EE1->getOperand(0) != EE2->getOperand(0))
          if (Idx1 == std::nullopt || Idx2 == std::nullopt)
          return *Idx1 < *Idx2;
    auto IsIdentityOrder = [](const OrdersType &Order) {
      for (unsigned Idx : seq<unsigned>(0, Order.size()))
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;
    std::iota(Phis.begin(), Phis.end(), 0);
    for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
    for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
      ResOrder[Id] = PhiToId[Phis[Id]];
    if (IsIdentityOrder(ResOrder))
      return std::nullopt;
    return std::move(ResOrder);
  if (TE.State == TreeEntry::NeedToGather) {
    if (((TE.getOpcode() == Instruction::ExtractElement &&
          !TE.isAltShuffle()) ||
           return isa<UndefValue, ExtractElementInst>(V);
         [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
      if (Reuse || !CurrentOrder.empty()) {
        if (!CurrentOrder.empty())
        return std::move(CurrentOrder);
    int Sz = TE.Scalars.size();
        find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
    if (It == TE.Scalars.begin())
    if (It != TE.Scalars.end()) {
      unsigned Idx = std::distance(TE.Scalars.begin(), It);
      if (InsertFirstCost + PermuteCost < InsertIdxCost)
        return std::move(Order);
      return CurrentOrder;
    if (TE.Scalars.size() >= 4)
  return std::nullopt;

  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    if (Cluster != FirstCluster)
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  const unsigned Sz = TE.Scalars.size();
  if (TE.State != TreeEntry::NeedToGather ||
  addMask(NewMask, TE.ReuseShuffleIndices);
  TE.ReorderIndices.clear();
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
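// Net effect (sketch): the reorder is folded into the gather node's
// ReuseShuffleIndices via addMask rather than kept in ReorderIndices, and
// each register-sized (Sz-wide) chunk of the reuse mask is reset to the
// identity, keeping the node a repetition of its first subvector.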
4389 ExternalUserReorderMap;
4395 for_each(VectorizableTree, [
this, &TTIRef, &VFToOrderedEntries,
4396 &GathersToOrders, &ExternalUserReorderMap,
4397 &AltShufflesToOrders, &PhisToOrders](
4398 const std::unique_ptr<TreeEntry> &TE) {
4401 findExternalStoreUsersReorderIndices(TE.get());
4402 if (!ExternalUserReorderIndices.
empty()) {
4403 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
4405 std::move(ExternalUserReorderIndices));
4411 if (TE->isAltShuffle()) {
4414 unsigned Opcode0 = TE->getOpcode();
4415 unsigned Opcode1 = TE->getAltOpcode();
4418 for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
4419 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
4420 OpcodeMask.
set(Lane);
4423 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
4429 if (std::optional<OrdersType> CurrentOrder =
4439 const TreeEntry *UserTE = TE.get();
4441 if (UserTE->UserTreeIndices.size() != 1)
4444 return EI.UserTE->State == TreeEntry::Vectorize &&
4445 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
4448 UserTE = UserTE->UserTreeIndices.back().UserTE;
4451 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
4452 if (!(TE->State == TreeEntry::Vectorize ||
4453 TE->State == TreeEntry::PossibleStridedVectorize) ||
4454 !TE->ReuseShuffleIndices.empty())
4455 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
4456 if (TE->State == TreeEntry::Vectorize &&
4457 TE->getOpcode() == Instruction::PHI)
4458 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
  for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
       VF /= 2) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order: the most used order is chosen
    // and the scalar elements of the nodes are reordered accordingly.
    ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
    MapVector<OrdersType, unsigned,
              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
        OrdersUses;
    // Last chance orders - scatter vectorize.
    SmallVector<OrdersType> StridedVectorizeOrders;
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder this node; just merge the reordering shuffle and
      // the reuse shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      // Count the number of order uses.
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->State == TreeEntry::NeedToGather ||
            !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // If the vector factor differs from the number of scalars, use the
        // natural order: this is an attempt to reorder a node with reused
        // scalars but with external uses.
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
        }
        // No other useful reorder data in this entry.
        if (Order.empty())
          continue;
      }
      // Postpone scatter orders.
      if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
        StridedVectorizeOrders.push_back(Order);
        continue;
      }
      // Stores actually store the mask, not the order; need to invert.
      if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(CurrentOrder);
        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
      } else {
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
      }
    }
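    // Pick the most used order; strided orders are considered only as a
    // fallback when nothing else was requested, or get an extra vote when
    // they match an already requested order.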
    if (OrdersUses.empty()) {
      if (StridedVectorizeOrders.empty())
        continue;
      // Add (potentially!) strided vectorize orders.
      for (OrdersType &Order : StridedVectorizeOrders)
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
    } else {
      // Account (potentially!) strided vectorize orders only if they were
      // requested already.
      for (OrdersType &Order : StridedVectorizeOrders) {
        auto *It = OrdersUses.find(Order);
        if (It != OrdersUses.end())
          ++It->second;
      }
    }
    // Choose the most used order.
    OrdersType BestOrder = OrdersUses.front().first;
    unsigned Cnt = OrdersUses.front().second;
    for (const auto &Pair : drop_begin(OrdersUses)) {
      if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
        BestOrder = Pair.first;
        Cnt = Pair.second;
      }
    }
    if (BestOrder.empty())
      continue;
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // Do an actual reordering, if profitable.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // Just do the reordering for the nodes with the given VF.
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          // Reorder the reuses masks of the operands with a smaller VF to be
          // able to match the graph nodes with the scalar operands of the
          // user node during vectorization/cost estimation.
          assert(all_of(TE->UserTreeIndices,
                        [VF, &TE](const EdgeInfo &EI) {
                          return EI.UserTE->Scalars.size() == VF ||
                                 EI.UserTE->Scalars.size() ==
                                     TE->Scalars.size();
                        }) &&
                 "All users must be of VF size.");
          reorderNodeWithReuses(*TE, Mask);
        }
        continue;
      }
      if ((TE->State == TreeEntry::Vectorize ||
           TE->State == TreeEntry::PossibleStridedVectorize) &&
          isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
              InsertElementInst>(TE->getMainOp()) &&
          !TE->isAltShuffle()) {
        // Build correct orders for extract{element,value}, loads and stores.
        reorderOrder(TE->ReorderIndices, Mask);
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        reorderScalars(TE->Scalars, Mask);
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply the reversed order to keep the original ordering of the
        // reused elements and avoid extra reorder indices shuffling.
        OrdersType CurrentOrder;
        reorderOrder(CurrentOrder, MaskOrder);
        SmallVector<int> NewReuses;
        inversePermutation(CurrentOrder, NewReuses);
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      }
    }
  }
}
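/// Checks whether the operands of \p UserTE along the edges \p Edges can all
/// be reordered: each vectorized operand must be used only by \p UserTE, and
/// each gathered operand must be matched by at most one reorderable gather
/// node, which is collected in \p GatherOps.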
bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    ArrayRef<TreeEntry *> ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 OpData.second->State == TreeEntry::Vectorize;
        }))
      continue;
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // Do not reorder (possibly!) strided vectorized nodes; they require
      // reordering of the operands, which is not implemented yet.
      if (TE->State == TreeEntry::PossibleStridedVectorize)
        return false;
      // Do not reorder if the operand node is used by many user nodes.
      if (any_of(TE->UserTreeIndices,
                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
        return false;
      // Add the node to the list of the ordered nodes with the identity
      // order.
      Edges.emplace_back(I, TE);
      // Add non-vectorized nodes without reuses/reorders to the operands,
      // where just reordering of the scalars is required.
      if (TE->State != TreeEntry::Vectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    TreeEntry *Gather = nullptr;
    if (count_if(ReorderableGathers,
                 [&Gather, UserTE, I](TreeEntry *TE) {
                   assert(TE->State != TreeEntry::Vectorize &&
                          "Only non-vectorized nodes are expected.");
                   if (any_of(TE->UserTreeIndices,
                              [UserTE, I](const EdgeInfo &EI) {
                                return EI.UserTE == UserTE && EI.EdgeIdx == I;
                              })) {
                     assert(TE->isSame(UserTE->getOperand(I)) &&
                            "Operand entry does not match operands.");
                     Gather = TE;
                     return true;
                   }
                   return false;
                 }) > 1 &&
        !allConstant(UserTE->getOperand(I)))
      return false;
    if (Gather)
      GatherOps.push_back(Gather);
  }
  return true;
}
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  SetVector<TreeEntry *> OrderedEntries;
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
  // Find all reorderable leaf nodes.
  SmallVector<TreeEntry *> NonVectorized;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::PossibleStridedVectorize)
      NonVectorized.push_back(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false)) {
      OrderedEntries.insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::PossibleStridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  }
  // Propagate the order to the graph nodes that use only reordered nodes.
  SmallPtrSet<const TreeEntry *, 4> Visited;
  while (!OrderedEntries.empty()) {
    // 1. Filter out only reordered nodes.
    // 2. If the entry has multiple uses - skip it and jump to the next node.
    DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
    SmallVector<TreeEntry *> Filtered;
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::PossibleStridedVectorize ||
            (TE->State == TreeEntry::NeedToGather &&
             GathersToOrders.count(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          !all_of(drop_begin(TE->UserTreeIndices),
                  [TE](const EdgeInfo &EI) {
                    return EI.UserTE == TE->UserTreeIndices.front().UserTE;
                  }) ||
          !Visited.insert(TE).second) {
        Filtered.push_back(TE);
        continue;
      }
      // Build a map between the user nodes and their operand orders to speed
      // up the search; the graph does not provide this dependency directly.
      for (EdgeInfo &EI : TE->UserTreeIndices) {
        TreeEntry *UserTE = EI.UserTE;
        auto It = Users.find(UserTE);
        if (It == Users.end())
          It = Users.insert({UserTE, {}}).first;
        It->second.emplace_back(EI.EdgeIdx, TE);
      }
    }
    // Erase filtered entries.
    for (TreeEntry *TE : Filtered)
      OrderedEntries.remove(TE);
    SmallVector<
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
        UsersVec(Users.begin(), Users.end());
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    });
    for (auto &Data : UsersVec) {
      // Check that the operands are used only in the User node.
      SmallVector<TreeEntry *> GatherOps;
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              GatherOps)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
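      // All operands can be reordered and are used only by this node: count
      // how many times each order is requested among the operands, so the
      // most popular one can be propagated to the user node.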
      MapVector<OrdersType, unsigned,
                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
          OrdersUses;
      // Last chance orders - scatter vectorize.
      SmallVector<std::pair<OrdersType, unsigned>> StridedVectorizeOrders;
      // Do the analysis for each tree entry only once, otherwise the order of
      // the same node may be counted several times.
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
          continue;
        const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
          if (OpTE->State == TreeEntry::NeedToGather ||
              !OpTE->ReuseShuffleIndices.empty())
            return GathersToOrders.find(OpTE)->second;
          return OpTE->ReorderIndices;
        }();
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Postpone scatter orders.
        if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
          StridedVectorizeOrders.emplace_back(Order, NumOps);
          continue;
        }
        // Stores actually store the mask, not the order; need to invert.
        if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          SmallVector<int> Mask;
          inversePermutation(Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(CurrentOrder);
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
              NumOps;
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        }
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
                                            const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() ||
              !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->State == TreeEntry::NeedToGather) {
            auto It = GathersToOrders.find(TE);
            if (It != GathersToOrders.end())
              return !It->second.empty();
            return true;
          }
          return false;
        };
        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
          TreeEntry *UserTE = EI.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            continue;
          // May reorder the user node if it requires reordering, has reused
          // scalars, is an alternate-opcode vectorize node or its operand
          // nodes require reordering.
          if (AllowsReordering(UserTE))
            continue;
          // Check if the users allow reordering: it is profitable only if
          // definitely more operands allow reordering than prefer the
          // natural order.
          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           all_of(Op.second->UserTreeIndices,
                                  [UserTE](const EdgeInfo &EI) {
                                    return EI.UserTE == UserTE;
                                  });
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
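      // If no order was requested, fall back to the (potentially!) strided
      // vectorize orders, unless the user node cannot benefit from them.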
      if (OrdersUses.empty()) {
        if (StridedVectorizeOrders.empty() ||
            (Data.first->ReorderIndices.empty() &&
             Data.first->ReuseShuffleIndices.empty() &&
             !(IgnoreReorder &&
               Data.first == VectorizableTree.front().get()))) {
          for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
            OrderedEntries.remove(Op.second);
          continue;
        }
        // Add (potentially!) strided vectorize orders.
        for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders)
          OrdersUses.insert(std::make_pair(Pair.first, 0)).first->second +=
              Pair.second;
      } else {
        // Account (potentially!) strided vectorize orders only if they were
        // requested already.
        for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) {
          auto *It = OrdersUses.find(Pair.first);
          if (It != OrdersUses.end())
            It->second += Pair.second;
        }
      }
      // Choose the most used order.
      OrdersType BestOrder = OrdersUses.front().first;
      unsigned Cnt = OrdersUses.front().second;
      for (const auto &Pair : drop_begin(OrdersUses)) {
        if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
          BestOrder = Pair.first;
          Cnt = Pair.second;
        }
      }
      // Set the order of the user node (reordering of operands and users).
      if (BestOrder.empty()) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // Erase the operands from the OrderedEntries list and adjust their
      // orders.
      VisitedOps.clear();
      SmallVector<int> Mask;
      inversePermutation(BestOrder, Mask);
      SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
      unsigned E = BestOrder.size();
      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        OrderedEntries.remove(TE);
        if (!VisitedOps.insert(TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::PossibleStridedVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(TE->ReorderIndices, Mask);
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers just need to reorder their scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // Just reorder the reuses indices.
          reorderReuses(Gather->ReuseShuffleIndices, Mask);
          continue;
        }
        reorderScalars(Gather->Scalars, Mask);
        OrderedEntries.remove(Gather);
      }
      // Reorder the operands of the user node and set the ordering for the
      // user node itself.
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Data.first->getMainOp()) ||
          Data.first->isAltShuffle())
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle() ||
          Data.first->State == TreeEntry::PossibleStridedVectorize) {
        reorderScalars(Data.first->Scalars, Mask);
        reorderOrder(Data.first->ReorderIndices, MaskOrder);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !Data.first->isAltShuffle()) {
          // Insert the user node into the list, to try to sink the
          // reordering deeper in the graph.
          OrderedEntries.insert(Data.first);
        }
      } else {
        reorderOrder(Data.first->ReorderIndices, Mask);
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
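/// Records every vectorized scalar that is still used outside the tree (or
/// appears in \p ExternallyUsedValues), together with its lane, so that an
/// extractelement can be generated for it after vectorization.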
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->State == TreeEntry::NeedToGather)
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
        continue;
      int FoundLane = Entry->findLaneForValue(Scalar);

      // Check if the scalar is externally used as an extra arg.
      auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << Lane << " from " << *Scalar << ".\n");
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
      }
      for (User *U : Scalar->users()) {
        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst || isDeleted(UserInst))
          continue;

        // Skip in-tree scalars that become vectors.
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          Value *UseScalar = UseEntry->Scalars[0];
          // Some in-tree scalars will remain as scalars in vectorized
          // instructions. If that is the case, the one in lane 0 will be
          // used.
          if (UseScalar != U ||
              UseEntry->State == TreeEntry::ScatterVectorize ||
              UseEntry->State == TreeEntry::PossibleStridedVectorize ||
              !doesInTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
            continue;
          }
        }

        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;

        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
                          << Lane << " from " << *Scalar << ".\n");
        ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
      }
    }
  }
}
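/// Collects the stores of the users of the scalars of \p TE, grouped by the
/// underlying pointer object, as candidate groups for vectorizing those
/// users.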
DenseMap<Value *, SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // To save compilation time we don't visit if we have too many users.
    static constexpr unsigned UsersLimit = 4;
    if (V->hasNUsesOrMore(UsersLimit))
      break;

    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      if (SI == nullptr || !SI->isSimple() ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // Skip entries already in the tree.
      if (getTreeEntry(U))
        continue;

      // ...
      auto &StoresVec = PtrToStoresMap[