#ifdef EXPENSIVE_CHECKS
using namespace slpvectorizer;
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
    cl::desc("Run the SLP vectorization passes"));
    cl::desc("Only vectorize if you gain more than this "
    cl::desc("Attempt to vectorize horizontal reductions"));
        "Attempt to vectorize horizontal reductions feeding into a store"));
    cl::desc("Allow optimization of original scalar identity operations on "
             "matched horizontal reductions."));
    cl::desc("Attempt to vectorize for this register size in bits"));
    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
    cl::desc("Maximum depth of the lookup for consecutive stores."));
    cl::desc("Limit the size of the SLP scheduling region per block"));
    cl::desc("Attempt to vectorize for this register size in bits"));
    cl::desc("Limit the recursion depth when building a vectorizable tree"));
    cl::desc("Only vectorize small trees if they are fully vectorizable"));
    cl::desc("The maximum look-ahead depth for operand reordering scores"));
    cl::desc("The maximum look-ahead depth for searching best rooting option"));
    cl::desc("Display the SLP trees with Graphviz"));
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
  if (isa<ExtractElementInst>(I))
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative();
  if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
  const auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
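// Worked example (added commentary; assumes the elided "Index += I" step of
// the full routine): inserting into { [2 x i32], float } at indices {0, 1}
// first visits the struct (Index *= 2, then += 0) and then the array
// (Index *= 2, then += 1), yielding the flattened element position 1.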
    if (MaskArg == UseMask::UndefsAsMask)
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  auto *C = dyn_cast<Constant>(V);
  if (!UseMask.empty()) {
    while (auto *II = dyn_cast<InsertElementInst>(Base)) {
      Base = II->getOperand(0);
      if (isa<T>(II->getOperand(1)))
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))
    Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
        (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
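// Note (added commentary): isUndefVector walks a chain of insertelement
// instructions backwards through Base = II->getOperand(0). Lanes cleared in
// UseMask are ignored, so a vector counts as undef (or poison, when
// IsPoisonOnly is set) as long as every *used* lane is undef/poison.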
static std::optional<TargetTransformInfo::ShuffleKind>
      find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
  auto *EI0 = cast<ExtractElementInst>(*It);
  if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
      cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    if (isa<UndefValue>(VL[I]))
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
    auto *Vec = EI->getVectorOperand();
    if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
    if (isa<UndefValue>(EI->getIndexOperand()))
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)
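// Example (added commentary): for extracts from two sources A and B of width
// 4, the sequence <A[0], B[1], A[2], B[3]> keeps every element in its own
// lane and classifies as Select, whereas <A[1], A[0], B[3], B[2]> moves
// elements across lanes and degrades CommonShuffleMode to Permute.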
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    return CI->getZExtValue();
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
static std::optional<TTI::ShuffleKind>
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (isa<UndefValue>(VL[I]))
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  for (const auto &Data : VectorOpToIdx)
    VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
        .push_back(Data.first);
  for (auto &Data : VFToVector) {
      return VectorOpToIdx.find(V1)->second.size() >
             VectorOpToIdx.find(V2)->second.size();
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  Value *SingleVec = nullptr;
  unsigned PairMax = 0;
  std::pair<Value *, Value *> PairVec(nullptr, nullptr);
  for (auto &Data : VFToVector) {
    if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
      SingleMax = VectorOpToIdx[V1].size() + UndefSz;
    if (Data.second.size() > 1)
      V2 = *std::next(Data.second.begin());
    if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
      PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
      PairVec = std::make_pair(V1, V2);
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : VectorOpToIdx[SingleVec])
    for (Value *V : {PairVec.first, PairVec.second})
      for (int Idx : VectorOpToIdx[V])
  for (int Idx : UndefVectorExtracts)
  std::optional<TTI::ShuffleKind> Res =
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
    if (Mask[I] == UndefMaskElem && !isa<PoisonValue>(GatheredExtracts[I]))
struct InstructionsState {
  Value *OpValue = nullptr;
  unsigned getAltOpcode() const {
  bool isAltShuffle() const { return AltOp != MainOp; }
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  InstructionsState() = delete;
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))
                                       unsigned BaseIndex = 0);
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&
                                       unsigned BaseIndex) {
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;
  auto *IBase = cast<Instruction>(VL[BaseIndex]);
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = cast<Instruction>(VL[Cnt]);
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        AltOpcode = InstOpcode;
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex != BaseIndex) {
        } else if (BasePred != CurrentPred) {
                 "CmpInst isn't safe for alternation, logic needs to be updated!");
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
          Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
          Mappings.front().VectorName != BaseMappings.front().VectorName ||
          Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
          Mappings.front().Shape.Parameters !=
              BaseMappings.front().Shape.Parameters)
        return InstructionsState(VL[BaseIndex], nullptr, nullptr);
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
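// Example (added commentary): for VL = {add, sub, add, sub} getSameOpcode
// returns MainOp = add and AltOp = sub, i.e. an alternate-opcode bundle that
// can later be emitted as two vector instructions blended by a shufflevector;
// any mix that cannot alternate safely yields a state with null MainOp/AltOp.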
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
  case Instruction::Store: {
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
    return SI->isSimple();
    return !MI->isVolatile();
  Mask.append(SubMask.begin(), SubMask.end());
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        Mask[SubMask[I]] >= TermValue)
    NewMask[I] = Mask[SubMask[I]];
  const unsigned Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");
  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Scalars[Mask[I]] = Prev[I];
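// Worked example (added commentary): inversePermutation of Indices = {1, 2, 0}
// produces Mask = {2, 0, 1}, since Mask[Indices[I]] = I; applying that mask in
// reorderScalars moves Prev[I] into Scalars[Mask[I]], undoing the permutation.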
  auto *I = dyn_cast<Instruction>(V);
    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();
  auto *I = dyn_cast<Instruction>(V);
  constexpr int UsesLimit = 8;
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
    auto *IU = dyn_cast<Instruction>(U);
    return IU->getParent() != I->getParent() || isa<PHINode>(IU);
  return !VL.empty() &&
namespace slpvectorizer {
  struct ScheduleData;
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
        DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
    return !VectorizableTree.empty() &&
           VectorizableTree.front()->State == TreeEntry::Vectorize;
    assert(!VectorizableTree.empty() && "No tree to get the first node from");
    return VectorizableTree.front()->getMainOp();
    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    EntryToLastInstruction.clear();
    ExternalUses.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    return MaxVecRegSize;
    return MinVecRegSize;
    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
    return MaxVF ? MaxVF : UINT_MAX;
    OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
       << " EdgeIdx:" << EdgeIdx << "}";
      : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
        MaxLevel(MaxLevel) {}
    if (isa<LoadInst>(V1)) {
      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        static constexpr unsigned Limit = 8;
        if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))
        auto AllUsersVectorized = [U1, U2, this](Value *V) {
            return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))
    auto *LI1 = dyn_cast<LoadInst>(V1);
    auto *LI2 = dyn_cast<LoadInst>(V2);
      if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
          LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
          LI2->getPointerOperand(), DL, SE, true);
      if (!Dist || *Dist == 0) {
            R.TTI->isLegalMaskedGather(
      if (std::abs(*Dist) > NumLanes / 2)
    auto *C1 = dyn_cast<Constant>(V1);
    auto *C2 = dyn_cast<Constant>(V2);
      if (isa<UndefValue>(V2))
      Value *EV2 = nullptr;
        int Dist = Idx2 - Idx1;
        if (std::abs(Dist) == 0)
        if (std::abs(Dist) > NumLanes / 2)
    auto *I1 = dyn_cast<Instruction>(V1);
    auto *I2 = dyn_cast<Instruction>(V2);
      if (I1->getParent() != I2->getParent())
      if (S.getOpcode() &&
          (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
           !S.isAltShuffle()) &&
            return cast<Instruction>(V)->getNumOperands() ==
                   S.MainOp->getNumOperands();
    if (isa<UndefValue>(V2))
    int ShallowScoreAtThisLevel =
    auto *I1 = dyn_cast<Instruction>(LHS);
    auto *I2 = dyn_cast<Instruction>(RHS);
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
          (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
          (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
         ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");
    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
          ? I2->getNumOperands()
          : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        if (Op2Used.count(OpIdx2))
            I1, I2, CurrLevel + 1, std::nullopt);
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
    return ShallowScoreAtThisLevel;
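// Note (added commentary): the loop above performs a greedy bipartite
// matching between the operands of I1 and I2. For each operand of I1 it
// picks the not-yet-used operand of I2 with the highest recursive score
// (computed at CurrLevel + 1), marks it used, and accumulates the per-pair
// maxima into ShallowScoreAtThisLevel.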
  struct OperandData {
    OperandData() = default;
    OperandData(Value *V, bool APO, bool IsUsed)
        : V(V), APO(APO), IsUsed(IsUsed) {}
    bool IsUsed = false;
  enum class ReorderingMode {
  OperandData &getData(unsigned OpIdx, unsigned Lane) {
    return OpsVec[OpIdx][Lane];
  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
    return OpsVec[OpIdx][Lane];
  for (unsigned OpIdx = 0, NumOperands = getNumOperands();
       OpIdx != NumOperands; ++OpIdx)
    for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
      OpsVec[OpIdx][Lane].IsUsed = false;
  void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
    std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
  int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
    for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
      Value *OpIdxLnV = getData(OpIdx, Ln).V;
      if (!isa<Instruction>(OpIdxLnV))
      Uniques.insert(OpIdxLnV);
    int UniquesCount = Uniques.size();
    int UniquesCntWithIdxLaneV =
        Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
    Value *OpIdxLaneV = getData(OpIdx, Lane).V;
    int UniquesCntWithOpIdxLaneV =
        Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
    if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
                UniquesCntWithOpIdxLaneV) -
           (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
  int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    Value *OpIdxLaneV = getData(OpIdx, Lane).V;
    auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
    if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
    return R.areAllUsersVectorized(IdxLaneI, std::nullopt)
  static const int ScoreScaleFactor = 10;
                        int Lane, unsigned OpIdx, unsigned Idx,
      int SplatScore = getSplatScore(Lane, OpIdx, Idx);
      if (Score <= -SplatScore) {
        Score += SplatScore;
        Score *= ScoreScaleFactor;
        Score += getExternalUseScore(Lane, OpIdx, Idx);
  std::optional<unsigned>
  getBestOperand(unsigned OpIdx, int Lane, int LastLane,
    unsigned NumOperands = getNumOperands();
    Value *OpLastLane = getData(OpIdx, LastLane).V;
    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;
    bool OpIdxAPO = getData(OpIdx, Lane).APO;
    std::optional<unsigned> Idx;
        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
        RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
      OperandData &OpData = getData(Idx, Lane);
      Value *Op = OpData.V;
      bool OpAPO = OpData.APO;
      if (OpAPO != OpIdxAPO)
      case ReorderingMode::Load:
      case ReorderingMode::Constant:
      case ReorderingMode::Opcode: {
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed);
        if (Score > static_cast<int>(BestOp.Score)) {
          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
      case ReorderingMode::Splat:
        if (Op == OpLastLane)
      case ReorderingMode::Failed:
      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
    return std::nullopt;
  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;
    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);
      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        auto It = HashMap.find(NumFreeOpsHash.Hash);
        if (It == HashMap.end())
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;
      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;
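// Note (added commentary): the starting lane is chosen by scanning lanes
// right-to-left, minimizing NumOfAPOs first and then the number of operands
// sharing opcode/parent; HashMap counts how many lanes share an identical
// operand "shape", and among the winners the least frequent shape is taken
// as the most representative lane to seed reordering from.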
  struct OperandsOrderData {
    unsigned NumOfAPOs = UINT_MAX;
    unsigned NumOpsWithSameOpcodeParent = 0;
  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
    unsigned CntTrue = 0;
    unsigned NumOperands = getNumOperands();
    bool AllUndefs = true;
    unsigned NumOpsWithSameOpcodeParent = 0;
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      const OperandData &OpData = getData(OpIdx, Lane);
      if (auto *I = dyn_cast<Instruction>(OpData.V)) {
            I->getParent() != Parent) {
          if (NumOpsWithSameOpcodeParent == 0) {
            NumOpsWithSameOpcodeParent = 1;
          Parent = I->getParent();
          --NumOpsWithSameOpcodeParent;
        ++NumOpsWithSameOpcodeParent;
          Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
      AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
    OperandsOrderData Data;
    Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
    Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
    assert((empty() || VL.size() == getNumLanes()) &&
           "Expected same number of lanes");
    assert(isa<Instruction>(VL[0]) && "Expected instruction");
    unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
    OpsVec.resize(NumOperands);
    unsigned NumLanes = VL.size();
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      OpsVec[OpIdx].resize(NumLanes);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
        bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
        bool APO = (OpIdx == 0) ? false : IsInverseOperation;
        OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
  unsigned getNumOperands() const { return OpsVec.size(); }
  unsigned getNumLanes() const { return OpsVec[0].size(); }
  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;
  bool empty() const { return OpsVec.empty(); }
  void clear() { OpsVec.clear(); }
  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
    bool OpAPO = getData(OpIdx, Lane).APO;
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
      bool FoundCandidate = false;
      for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
        OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)
        FoundCandidate = true;
      if (!FoundCandidate)
      : TLI(TLI), DL(DL), SE(SE), R(R) {
    appendOperandsOfVL(RootVL);
    assert(OpsVec[OpIdx].size() == getNumLanes() &&
           "Expected same num of lanes across all operands");
    for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
      OpVL[Lane] = OpsVec[OpIdx][Lane].V;
    unsigned NumOperands = getNumOperands();
    unsigned NumLanes = getNumLanes();
    unsigned FirstLane = getBestLaneToStartReordering();
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      Value *OpLane0 = getValue(OpIdx, FirstLane);
      if (isa<LoadInst>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Load;
      else if (isa<Instruction>(OpLane0)) {
        if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
          ReorderingModes[OpIdx] = ReorderingMode::Opcode;
      } else if (isa<Constant>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Constant;
      else if (isa<Argument>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Splat;
        ReorderingModes[OpIdx] = ReorderingMode::Failed;
    auto &&SkipReordering = [this]() {
      for (const OperandData &Data : Op0)
        if (any_of(Op, [&UniqueValues](const OperandData &Data) {
    if (SkipReordering())
    bool StrategyFailed = false;
    for (unsigned I = 0; I < NumOperands; ++I)
      MainAltOps[I].push_back(getData(I, FirstLane).V);
    for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
      int Lane = FirstLane + Direction * Distance;
      if (Lane < 0 || Lane >= (int)NumLanes)
      assert(LastLane >= 0 && LastLane < (int)NumLanes &&
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        std::optional<unsigned> BestIdx = getBestOperand(
            OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
          swap(OpIdx, *BestIdx, Lane);
          ReorderingModes[OpIdx] = ReorderingMode::Failed;
          StrategyFailed = true;
        if (MainAltOps[OpIdx].size() != 2) {
          OperandData &AltOp = getData(OpIdx, Lane);
          InstructionsState OpS =
          if (OpS.getOpcode() && OpS.isAltShuffle())
    if (!StrategyFailed)
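// Note (added commentary): reordering proceeds outwards from FirstLane, one
// Distance at a time in both directions. For every operand index it asks
// getBestOperand for the value that best matches the previous lane, swaps it
// into place, and downgrades the operand to ReorderingMode::Failed (setting
// StrategyFailed) when no candidate can be found.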
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    case ReorderingMode::Load:
    case ReorderingMode::Opcode:
    case ReorderingMode::Constant:
    case ReorderingMode::Splat:
    case ReorderingMode::Failed:
    const unsigned Indent = 2;
      OS << "Operand " << Cnt++ << "\n";
      for (const OperandData &OpData : OpDataVec) {
        if (Value *V = OpData.V)
        OS << ", APO:" << OpData.APO << "}\n";
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
          Candidates[I].second,
      if (Score > BestScore) {
    DeletedInstructions.insert(I);
    return AnalyzedReductionsRoots.count(I);
    AnalyzedReductionsRoots.insert(I);
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
  canReorderOperands(TreeEntry *UserTE,
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    TreeEntry *TE = nullptr;
      TE = getTreeEntry(V);
    if (It != VL.end() && TE->isSame(VL))
  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);
                          const EdgeInfo &EI);
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
  Value *createBuildVector(const TreeEntry *E);
                                const APInt &ShuffledIndices,
                                bool NeedToShuffle) const;
  Instruction &getLastInstructionInBundle(const TreeEntry *E);
  std::optional<TargetTransformInfo::ShuffleKind>
  void setInsertPointAfterBundle(const TreeEntry *E);
  bool isFullyVectorizableTinyTree(bool ForReduction) const;
  static void reorderInputsAccordingToOpcode(
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
    TreeEntry(VecTreeTy &Container) : Container(Container) {}
                        [Scalars](Value *V, int Idx) {
                          return (isa<UndefValue>(V) &&
                                  Idx == UndefMaskElem) ||
                                 (Idx != UndefMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
        return IsSame(Scalars, Mask);
      if (VL.size() == ReuseShuffleIndices.size()) {
        return IsSame(Scalars, Mask);
      return IsSame(Scalars, ReuseShuffleIndices);
    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return State == TreeEntry::NeedToGather &&
             UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
        if (PrevCount == Used.count())
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    Value *VectorizedValue = nullptr;
    enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
    VecTreeTy &Container;
      assert(Operands[OpIdx].empty() && "Already resized?");
             "Number of operands is greater than the number of scalars.");
    void setOperandsInOrder() {
      auto *I0 = cast<Instruction>(Scalars[0]);
      Operands.resize(I0->getNumOperands());
      unsigned NumLanes = Scalars.size();
      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
           OpIdx != NumOperands; ++OpIdx) {
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          auto *I = cast<Instruction>(Scalars[Lane]);
          assert(I->getNumOperands() == NumOperands &&
                 "Expected same number of operands");
          Operands[OpIdx][Lane] = I->getOperand(OpIdx);
    unsigned getNumOperands() const { return Operands.size(); }
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(!Operands[OpIdx].empty() && "No operand available");
    bool isAltShuffle() const { return MainOp != AltOp; }
      unsigned CheckedOpcode = I->getOpcode();
      return (getOpcode() == CheckedOpcode ||
              getAltOpcode() == CheckedOpcode);
      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))
    void setOperations(const InstructionsState &S) {
    unsigned getOpcode() const {
      return MainOp ? MainOp->getOpcode() : 0;
    unsigned getAltOpcode() const {
    int findLaneForValue(Value *V) const {
      unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReorderIndices.empty())
        FoundLane = ReorderIndices[FoundLane];
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReuseShuffleIndices.empty()) {
        FoundLane = std::distance(ReuseShuffleIndices.begin(),
                                  find(ReuseShuffleIndices, FoundLane));
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
        dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        dbgs() << "NeedToGather\n";
      dbgs() << "MainOp: ";
        dbgs() << *MainOp << "\n";
      dbgs() << "AltOp: ";
        dbgs() << *AltOp << "\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";
    dbgs() << "SLP: " << Banner << ":\n";
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                        ReuseShuffleIndices, ReorderIndices);
                          TreeEntry::EntryState EntryState,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
           "Need to vectorize gather entry?");
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
      Last->Scalars.assign(VL.size(), nullptr);
        if (Idx >= VL.size())
          return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    if (Last->State != TreeEntry::NeedToGather) {
      for (Value *V : VL) {
        assert(!getTreeEntry(V) && "Scalar already in tree!");
        ScalarToTreeEntry[V] = Last;
      ScheduleData *BundleMember = *Bundle;
      assert((BundleMember || isa<PHINode>(S.MainOp) ||
             "Bundle and VL out of sync");
      for (Value *V : VL) {
        assert(BundleMember && "Unexpected end of bundle.");
        BundleMember->TE = Last;
        BundleMember = BundleMember->NextInBundle;
      assert(!BundleMember && "Bundle and VL out of sync");
      MustGather.insert(VL.begin(), VL.end());
    if (UserTreeIdx.UserTE)
      Last->UserTreeIndices.push_back(UserTreeIdx);
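// Note (added commentary): a new TreeEntry either owns its scalars (the
// ScalarToTreeEntry mapping plus back-pointers from every ScheduleData bundle
// member) or, for NeedToGather nodes, registers them in MustGather; the
// asserts above enforce that a bundle exists exactly when the entry is
// vectorizable.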
  TreeEntry::VecTreeTy VectorizableTree;
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
  const TreeEntry *getTreeEntry(Value *V) const {
    return ScalarToTreeEntry.lookup(V);
  struct ExternalUser {
    AliasCacheKey key = std::make_pair(Inst1, Inst2);
    std::optional<bool> &result = AliasCache[key];
    bool aliased = true;
  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
  UserList ExternalUses;
  struct ScheduleData {
    enum { InvalidDeps = -1 };
    ScheduleData() = default;
    void init(int BlockSchedulingRegionID, Value *OpVal) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
        assert(UnscheduledDeps == Dependencies && "invariant");
        assert(isSchedulingEntity() &&
               "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");
      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
    bool isSchedulingEntity() const { return FirstInBundle == this; }
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();
    void resetUnscheduledDeps() {
      UnscheduledDeps = Dependencies;
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
        Sum += BundleMember->UnscheduledDeps;
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        ScheduleData *SD = NextInBundle;
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
    Value *OpValue = nullptr;
    TreeEntry *TE = nullptr;
    ScheduleData *FirstInBundle = nullptr;
    ScheduleData *NextInBundle = nullptr;
    ScheduleData *NextLoadStore = nullptr;
    int SchedulingRegionID = 0;
    int SchedulingPriority = 0;
    int Dependencies = InvalidDeps;
    int UnscheduledDeps = InvalidDeps;
    bool IsScheduled = false;
                                       const BoUpSLP::ScheduleData &SD) {
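// Note (added commentary): Dependencies counts all known dependencies of a
// node, while UnscheduledDeps counts the not-yet-scheduled subset; both start
// at InvalidDeps (-1) until dependencies are calculated, and a bundle becomes
// ready once unscheduledDepsInBundle() reaches zero.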
  struct BlockScheduling {
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      ScheduleRegionSize = 0;
      ++SchedulingRegionID;
      if (BB != I->getParent())
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))
    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);
    ScheduleData *getScheduleData(Value *V, Value *Key) {
        return getScheduleData(V);
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end()) {
        ScheduleData *SD = I->second.lookup(Key);
        if (SD && isInSchedulingRegion(SD))
    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->Inst != BundleMember->OpValue)
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
            if (OpDef && OpDef->hasValidDependencies() &&
                OpDef->incrementUnscheduledDeps(-1) == 0) {
              ScheduleData *DepBundle = OpDef->FirstInBundle;
              assert(!DepBundle->IsScheduled &&
                     "already scheduled bundle gets ready");
              ReadyList.insert(DepBundle);
                         << "SLP: gets ready (def): " << *DepBundle << "\n");
        if (TreeEntry *TE = BundleMember->TE) {
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, BundleMember->Inst));
          assert(Lane >= 0 && "Lane not set");
          auto *In = BundleMember->Inst;
                 (isa<ExtractValueInst, ExtractElementInst>(In) ||
                  In->getNumOperands() == TE->getNumOperands()) &&
                 "Missed TreeEntry operands?");
          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
                       << "SLP: gets ready (mem): " << *DepBundle << "\n");
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
                       << "SLP: gets ready (ctl): " << *DepBundle << "\n");
      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
        doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");
    void doForAllOpcodes(Value *V,
      if (ScheduleData *SD = getScheduleData(V))
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end())
        for (auto &P : I->second)
          if (isInSchedulingRegion(P.second))
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        doForAllOpcodes(I, [&](ScheduleData *SD) {
          if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
            ReadyList.insert(SD);
                       << "SLP: initially in ready list: " << *SD << "\n");
    std::optional<ScheduleData *>
                 const InstructionsState &S);
    ScheduleData *allocateScheduleDataChunks();
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);
                         ScheduleData *PrevLoadStore,
                         ScheduleData *NextLoadStore);
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
    void resetSchedule();
    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
        ExtraScheduleDataMap;
    ScheduleData *FirstLoadStoreInRegion = nullptr;
    ScheduleData *LastLoadStoreInRegion = nullptr;
    bool RegionHasStackSave = false;
    int ScheduleRegionSize = 0;
    int SchedulingRegionID = 1;
  void scheduleBlock(BlockScheduling *BS);
  struct OrdersTypeDenseMapInfo {
    static unsigned getHashValue(const OrdersType &V) {
  unsigned MaxVecRegSize;
  unsigned MinVecRegSize;
  struct ChildIteratorType
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    return R.VectorizableTree[0].get();
    return {N->UserTreeIndices.begin(), N->Container};
    return {N->UserTreeIndices.end(), N->Container};
  class nodes_iterator {
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
    return nodes_iterator(R->VectorizableTree.begin());
    return nodes_iterator(R->VectorizableTree.end());
  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
    OS << Entry->Idx << ".\n";
    for (auto *V : Entry->Scalars) {
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
    if (Entry->State == TreeEntry::NeedToGather)
    if (Entry->State == TreeEntry::ScatterVectorize)
      return "color=blue";
  for (auto *I : DeletedInstructions) {
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
    I->dropAllReferences();
  for (auto *I : DeletedInstructions) {
           "trying to erase instruction with users.");
    I->eraseFromParent();
#ifdef EXPENSIVE_CHECKS
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Reuses[Mask[I]] = Prev[I];
  assert(!Mask.empty() && "Expected non-empty mask.");
  if (Order.empty()) {
    MaskOrder.resize(Mask.size());
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  Order.assign(Mask.size(), Mask.size());
  for (unsigned I = 0, E = Mask.size(); I < E; ++I)
      Order[MaskOrder[I]] = I;
std::optional<BoUpSLP::OrdersType>
  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
  unsigned NumScalars = TE.Scalars.size();
  OrdersType CurrentOrder(NumScalars, NumScalars);
  const TreeEntry *STE = nullptr;
  for (unsigned I = 0; I < NumScalars; ++I) {
    Value *V = TE.Scalars[I];
    if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
    if (const auto *LocalSTE = getTreeEntry(V)) {
      else if (STE != LocalSTE)
        return std::nullopt;
          std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
      if (Lane >= NumScalars)
        return std::nullopt;
      if (CurrentOrder[Lane] != NumScalars) {
        UsedPositions.reset(CurrentOrder[Lane]);
      CurrentOrder[Lane] = I;
      UsedPositions.set(I);
  if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
      for (unsigned I = 0; I < NumScalars; ++I)
        if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
    if (IsIdentityOrder(CurrentOrder))
    auto *It = CurrentOrder.begin();
    for (unsigned I = 0; I < NumScalars;) {
      if (UsedPositions.test(I)) {
      if (*It == NumScalars) {
    return std::move(CurrentOrder);
  return std::nullopt;
enum class LoadsState { Gather, Vectorize, ScatterVectorize };
                            bool CompareOpcodes = true) {
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
          getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
  if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = cast<LoadInst>(V);
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    std::optional<int> Diff =
    if (static_cast<unsigned>(*Diff) == VL.size() - 1)
      return LoadsState::Vectorize;
  bool ProfitableGatherPointers =
        return L && L->isLoopInvariant(V);
      })) <= VL.size() / 2 && VL.size() > 2;
  if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
            (GEP && GEP->getNumOperands() == 2);
    Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
          std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
      return LoadsState::ScatterVectorize;
  return LoadsState::Gather;
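// Example (added commentary): four loads whose pointers differ by 0, 1, 2, 3
// elements give *Diff == VL.size() - 1 after sorting and are classified as
// LoadsState::Vectorize (one wide load). Loads from unrelated but simple
// pointers may still become LoadsState::ScatterVectorize (masked gather) when
// the target reports that legal; everything else falls back to Gather.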
         VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
         "Expected list of pointer operands.");
  Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
    std::optional<int> Diff =
      Base.second.emplace_back(Ptr, *Diff, Cnt++);
      if (Bases.size() > VL.size() / 2 - 1)
      Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
  bool AnyConsecutive = false;
  for (auto &Base : Bases) {
    auto &Vec = Base.second;
    if (Vec.size() > 1) {
                    const std::tuple<Value *, int, unsigned> &Y) {
        return std::get<1>(X) < std::get<1>(Y);
      int InitialOffset = std::get<1>(Vec[0]);
        return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
    SortedIndices.clear();
  if (!AnyConsecutive)
  for (auto &Base : Bases) {
    for (auto &T : Base.second)
         "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();
  Ptrs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    return std::move(Order);
  return std::nullopt;
  if (VU->getType() != V->getType())
  if (!VU->hasOneUse() && !V->hasOneUse())
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
  bool IsReusedIdx = false;
    if (IE2 == VU && !IE1)
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
      IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    if (IE2 && IE2 != VU) {
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
      IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
  } while (!IsReusedIdx && (IE1 || IE2));
std::optional<BoUpSLP::OrdersType>
  if (!TE.ReuseShuffleIndices.empty()) {
    unsigned Sz = TE.Scalars.size();
      return std::nullopt;
    unsigned VF = TE.getVectorFactor();
                              TE.ReuseShuffleIndices.end());
    if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
          return Idx && *Idx < Sz;
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        Value *V = TE.Scalars[ReorderMask[Idx]];
          Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      std::iota(SubMask.begin(), SubMask.end(), 0);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
               [](const auto &Data) { return Data.index() == Data.value(); }))
      return std::nullopt;
    return std::move(ResOrder);
  if (TE.State == TreeEntry::Vectorize &&
      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
    return TE.ReorderIndices;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
      if (!V1->hasOneUse() || !V2->hasOneUse())
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
        if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
          if (Idx1 == std::nullopt || Idx2 == std::nullopt)
          return *Idx1 < *Idx2;
      if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
        if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
          if (EE1->getOperand(0) != EE2->getOperand(0))
          if (Idx1 == std::nullopt || Idx2 == std::nullopt)
          return *Idx1 < *Idx2;
    auto IsIdentityOrder = [](const OrdersType &Order) {
      for (unsigned Idx : seq<unsigned>(0, Order.size()))
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;
    for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id) {
      PhiToId[TE.Scalars[Id]] = Id;
      Phis.push_back(TE.Scalars[Id]);
    for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
      ResOrder[Id] = PhiToId[Phis[Id]];
    if (IsIdentityOrder(ResOrder))
      return std::nullopt;
    return std::move(ResOrder);
  if (TE.State == TreeEntry::NeedToGather) {
    if (((TE.getOpcode() == Instruction::ExtractElement &&
          !TE.isAltShuffle()) ||
          return isa<UndefValue, ExtractElementInst>(V);
         [](Value *V) { return isa<ExtractElementInst>(V); }))) &&
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder);
      if (Reuse || !CurrentOrder.empty()) {
        if (!CurrentOrder.empty())
        return std::move(CurrentOrder);
    int Sz = TE.Scalars.size();
        find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
    if (It == TE.Scalars.begin())
    if (It != TE.Scalars.end()) {
      unsigned Idx = std::distance(TE.Scalars.begin(), It);
      if (InsertFirstCost + PermuteCost < InsertIdxCost)
        return std::move(Order);
      return CurrentOrder;
    if (TE.Scalars.size() >= 4)
  return std::nullopt;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    if (Cluster != FirstCluster)
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  const unsigned Sz = TE.Scalars.size();
  if (TE.State != TreeEntry::NeedToGather ||
  addMask(NewMask, TE.ReuseShuffleIndices);
  TE.ReorderIndices.clear();
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
4297 ExternalUserReorderMap;
4303 for_each(VectorizableTree, [
this, &TTIRef, &VFToOrderedEntries,
4304 &GathersToOrders, &ExternalUserReorderMap,
4305 &AltShufflesToOrders, &PhisToOrders](
4306 const std::unique_ptr<TreeEntry> &TE) {
4309 findExternalStoreUsersReorderIndices(TE.get());
4310 if (!ExternalUserReorderIndices.
empty()) {
4311 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
4313 std::move(ExternalUserReorderIndices));
4319 if (TE->isAltShuffle()) {
4322 unsigned Opcode0 = TE->getOpcode();
4323 unsigned Opcode1 = TE->getAltOpcode();
4326 for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
4327 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
4328 OpcodeMask.
set(Lane);
4331 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
4337 if (std::optional<OrdersType> CurrentOrder =
4347 const TreeEntry *UserTE = TE.get();
4349 if (UserTE->UserTreeIndices.size() != 1)
4352 return EI.UserTE->State == TreeEntry::Vectorize &&
4353 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
4356 UserTE = UserTE->UserTreeIndices.back().UserTE;
4359 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
4360 if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
4361 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
4362 if (TE->State == TreeEntry::Vectorize &&
4363 TE->getOpcode() == Instruction::PHI)
4364 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
  // Reorder the graph nodes according to the most used order.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
       VF /= 2) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order: count how often each order is
    // used and reorder scalar elements in the nodes according to the most
    // used one.
    ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
    MapVector<OrdersType, unsigned,
              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
        OrdersUses;
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder this node; just merge the reordering shuffle and
      // the reuse shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      // Count the number of uses of each order.
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->State == TreeEntry::NeedToGather ||
            !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // If the OpTE vector factor != number of scalars - use natural order,
        // it is an attempt to reorder a node with reused scalars but with
        // external uses.
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
        }
        // No other useful reorder data in this entry.
        if (Order.empty())
          continue;
      }
      // Stores actually store the mask, not the order, need to invert.
      if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(CurrentOrder);
        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
      } else {
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    OrdersType BestOrder = OrdersUses.front().first;
    unsigned Cnt = OrdersUses.front().second;
    for (const auto &Pair : drop_begin(OrdersUses)) {
      if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
        BestOrder = Pair.first;
        Cnt = Pair.second;
      }
    }
    // Set order of the user node.
    if (BestOrder.empty())
      continue;
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : UndefMaskElem;
    });
    // Do an actual reordering, if profitable.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // Just do the reordering for the nodes with the given VF.
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          // Need to reorder the reuses masks of the operands with smaller VF
          // to be able to find the match between the graph nodes and scalar
          // operands of the given node during vectorization/cost estimation.
          assert(all_of(TE->UserTreeIndices,
                        [VF, &TE](const EdgeInfo &EI) {
                          return EI.UserTE->Scalars.size() == VF ||
                                 EI.UserTE->Scalars.size() ==
                                     TE->Scalars.size();
                        }) &&
                 "All users must be of VF size.");
          // Update ordering of the operands with the smaller VF than the
          // given one.
          reorderNodeWithReuses(*TE, Mask);
        }
        continue;
      }
      if (TE->State == TreeEntry::Vectorize &&
          isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
              InsertElementInst>(TE->getMainOp()) &&
          !TE->isAltShuffle()) {
        // Build correct orders for extract{element,value}, loads and stores.
        reorderOrder(TE->ReorderIndices, Mask);
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        reorderScalars(TE->Scalars, Mask);
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply reversed order to keep the original ordering of the reused
        // elements to avoid extra reorder indices shuffling.
        OrdersType CurrentOrder;
        reorderOrder(CurrentOrder, MaskOrder);
        SmallVector<int> NewReuses;
        inversePermutation(CurrentOrder, NewReuses);
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      }
    }
  }
}
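// Illustrative example (not from the original source): if three entries at
// VF == 4 vote for the order {1, 0, 3, 2} and one votes for the identity,
// BestOrder == {1, 0, 3, 2}. inversePermutation() produces the shuffle mask
// {1, 0, 3, 2} (this particular order is its own inverse), and the mask is
// then applied uniformly to every node of that VF.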
bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    ArrayRef<TreeEntry *> ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 OpData.second->State == TreeEntry::Vectorize;
        }))
      continue;
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // Do not reorder if operand node is used by many user nodes.
      if (any_of(TE->UserTreeIndices,
                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
        return false;
      // Add the node to the list of the ordered nodes with the identity
      // order.
      Edges.emplace_back(I, TE);
      // Add ScatterVectorize nodes to the list of operands, where just
      // reordering of the scalars is required, similar to the gathers. If
      // there are reused scalars, process this node as a regular vectorize
      // node and just reorder the reuses mask.
      if (TE->State != TreeEntry::Vectorize && TE->ReuseShuffleIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    TreeEntry *Gather = nullptr;
    if (count_if(ReorderableGathers,
                 [&Gather, UserTE, I](TreeEntry *TE) {
                   assert(TE->State != TreeEntry::Vectorize &&
                          "Only non-vectorized nodes are expected.");
                   if (any_of(TE->UserTreeIndices,
                              [UserTE, I](const EdgeInfo &EI) {
                                return EI.UserTE == UserTE && EI.EdgeIdx == I;
                              })) {
                     assert(TE->isSame(UserTE->getOperand(I)) &&
                            "Operand entry does not match operands.");
                     Gather = TE;
                     return true;
                   }
                   return false;
                 }) > 1 &&
        !allConstant(UserTE->getOperand(I)))
      return false;
    if (Gather)
      GatherOps.push_back(Gather);
  }
  return true;
}
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  SetVector<TreeEntry *> OrderedEntries;
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
  // Find all reorderable leaf nodes with the given VF: currently vectorized
  // loads and extracts without alternate operands plus some gathering of
  // extracts.
  SmallVector<TreeEntry *> NonVectorized;
  for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders,
                              &NonVectorized](
                                 const std::unique_ptr<TreeEntry> &TE) {
    if (TE->State != TreeEntry::Vectorize)
      NonVectorized.push_back(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false)) {
      OrderedEntries.insert(TE.get());
      if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });

  // 1. Propagate order to the graph nodes which use only reordered nodes.
  // 2. If the entry has multiple uses - skip it and jump to the next node.
  SmallPtrSet<const TreeEntry *, 4> Visited;
  while (!OrderedEntries.empty()) {
    DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
    SmallVector<TreeEntry *> Filtered;
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            (TE->State == TreeEntry::NeedToGather &&
             GathersToOrders.count(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          !all_of(drop_begin(TE->UserTreeIndices),
                  [TE](const EdgeInfo &EI) {
                    return EI.UserTE == TE->UserTreeIndices.front().UserTE;
                  }) ||
          !Visited.insert(TE).second) {
        Filtered.push_back(TE);
        continue;
      }
      // Build a map between user nodes and their operands order to speed up
      // the search. The graph currently does not provide this dependency
      // directly.
      for (EdgeInfo &EI : TE->UserTreeIndices) {
        TreeEntry *UserTE = EI.UserTE;
        auto It = Users.find(UserTE);
        if (It == Users.end())
          It = Users.insert({UserTE, {}}).first;
        It->second.emplace_back(EI.EdgeIdx, TE);
      }
    }
    // Erase filtered entries, as such entries must not be checked again.
    for_each(Filtered,
             [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
    SmallVector<
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
        UsersVec(Users.begin(), Users.end());
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    });
    for (auto &Data : UsersVec) {
      // Check that operands are used only in the User node.
      SmallVector<TreeEntry *> GatherOps;
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              GatherOps)) {
        for_each(Data.second,
                 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
                   OrderedEntries.remove(Op.second);
                 });
        continue;
      }
      // All operands are reordered and used only in this node - propagate
      // the most used order to the user node.
      MapVector<OrdersType, unsigned,
                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
          OrdersUses;
      // Do the analysis for each tree entry only once, otherwise the order of
      // the same node may be considered several times, though might not be
      // profitable.
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
          continue;
        const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
          if (OpTE->State == TreeEntry::NeedToGather ||
              !OpTE->ReuseShuffleIndices.empty())
            return GathersToOrders.find(OpTE)->second;
          return OpTE->ReorderIndices;
        }();
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask, not the order, need to invert.
        if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          SmallVector<int> Mask;
          inversePermutation(Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
            return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(CurrentOrder);
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
              NumOps;
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        }
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
                                            const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->State == TreeEntry::NeedToGather) {
            auto It = GathersToOrders.find(TE);
            if (It != GathersToOrders.end())
              return !It->second.empty();
            return true;
          }
          return false;
        };
        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
          TreeEntry *UserTE = EI.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            continue;
          // May reorder the user node if it requires reordering, has reused
          // scalars, is an alternate op vectorize node or its op nodes
          // require reordering.
          if (AllowsReordering(UserTE))
            continue;
          // Check if the users allow reordering. Currently look up just one
          // level of operands to avoid an increase of compile time.
          // Profitable to reorder if definitely more operands allow
          // reordering than those with natural order.
          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           all_of(Op.second->UserTreeIndices,
                                  [UserTE](const EdgeInfo &EI) {
                                    return EI.UserTE == UserTE;
                                  });
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
      // If there are no orders - skip these nodes and jump to the next one,
      // if any.
      if (OrdersUses.empty()) {
        for_each(Data.second,
                 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
                   OrderedEntries.remove(Op.second);
                 });
        continue;
      }
      // Choose the best order.
      OrdersType BestOrder = OrdersUses.front().first;
      unsigned Cnt = OrdersUses.front().second;
      for (const auto &Pair : drop_begin(OrdersUses)) {
        if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
          BestOrder = Pair.first;
          Cnt = Pair.second;
        }
      }
      // Set the order of the user node.
      if (BestOrder.empty()) {
        for_each(Data.second,
                 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
                   OrderedEntries.remove(Op.second);
                 });
        continue;
      }
      // Erase operands from OrderedEntries list and adjust their orders.
      VisitedOps.clear();
      SmallVector<int> Mask;
      inversePermutation(BestOrder, Mask);
      SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);
      unsigned E = BestOrder.size();
      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
        return I < E ? static_cast<int>(I) : UndefMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        OrderedEntries.remove(TE);
        if (!VisitedOps.insert(TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize)
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(TE->ReorderIndices, Mask);
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers just need to reorder its scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // Just reorder reuses indices.
          reorderReuses(Gather->ReuseShuffleIndices, Mask);
          continue;
        }
        reorderScalars(Gather->Scalars, Mask);
        OrderedEntries.remove(Gather);
      }
      // Reorder operands of the user node and set the ordering for the user
      // node itself.
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Data.first->getMainOp()) ||
          Data.first->isAltShuffle())
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle()) {
        reorderScalars(Data.first->Scalars, Mask);
        reorderOrder(Data.first->ReorderIndices, MaskOrder);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !Data.first->isAltShuffle()) {
          // Insert user node into the list to try to sink reordering deeper
          // in the graph.
          OrderedEntries.insert(Data.first);
        }
      } else {
        reorderOrder(Data.first->ReorderIndices, Mask);
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
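// Illustrative walk-through (assumed shapes, not from the source): suppose a
// user node has operands O1 with order {2, 3, 0, 1} and O2 with natural
// order. O1 contributes one vote for {2, 3, 0, 1}; O2's users are inspected
// via AllowsReordering() and may add a vote for the empty (natural) order.
// If {2, 3, 0, 1} wins, the operands and the user are reordered and the user
// is re-inserted into OrderedEntries so the order can keep propagating
// toward the root of the graph.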
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->State == TreeEntry::NeedToGather)
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      int FoundLane = Entry->findLaneForValue(Scalar);

      // Check if the scalar is externally used as an extra arg.
      auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << Lane << " from " << *Scalar << ".\n");
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
      }
      for (User *U : Scalar->users()) {
        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst || isDeleted(UserInst))
          continue;

        // Skip in-tree scalars that become vectors.
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          Value *UseScalar = UseEntry->Scalars[0];
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in lane 0 will be
          // used.
          if (UseScalar != U ||
              UseEntry->State == TreeEntry::ScatterVectorize ||
              !doesInTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
            continue;
          }
        }

        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;

        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
                          << Lane << " from " << *Scalar << ".\n");
        ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
      }
    }
  }
}
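// Illustrative example (not from the original source): for an entry with
// Scalars == {a, b} where b also feeds a call outside the vectorizable tree,
// an ExternalUser(b, call, 1) record is appended, and codegen will later
// extract lane 1 of the vector to feed the call.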
DenseMap<Value *, SmallVector<StoreInst *, 4>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // To save compilation time we don't visit if it has too many users.
    static constexpr unsigned UsersLimit = 4;
    if (V->hasNUsesOrMore(UsersLimit))
      break;

    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      if (SI == nullptr || !SI->isSimple() ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // Skip entry if already vectorized.
      if (getTreeEntry(U))
        continue;

      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      auto &StoresVec = PtrToStoresMap[Ptr];
      // For now just keep one store per pointer object per lane.
      if (StoresVec.size() > Lane)
        continue;
      // Skip if this store is in a different basic block.
      if (!StoresVec.empty() &&
          SI->getParent() != StoresVec.back()->getParent())
        continue;
      // Make sure that the stores are of the same type.
      if (!StoresVec.empty() &&
          SI->getValueOperand()->getType() !=
              StoresVec.back()->getValueOperand()->getType())
        continue;
      StoresVec.push_back(SI);
    }
  }
  return PtrToStoresMap;
}
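// Illustrative example (not from the original source): if lane 0's scalar is
// stored to %p and lane 1's scalar to a GEP off the same base %p, both
// stores share one PtrToStoresMap bucket (keyed by the underlying object),
// one store per lane, ready for the consecutiveness check in canFormVector().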
bool BoUpSLP::canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,
                            OrdersType &ReorderIndices) const {
  // We check whether the stores in StoresVec can form a vector by sorting
  // them and checking whether they are consecutive.

  // To avoid calling getPointersDiff() while sorting we create a vector of
  // pairs {store, offset from first} and sort this instead.
  SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
  StoreInst *S0 = StoresVec[0];
  StoreOffsetVec[0] = {S0, 0};
  Type *S0Ty = S0->getValueOperand()->getType();
  Value *S0Ptr = S0->getPointerOperand();
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int> Diff =
        getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    // We failed to compare the pointers so just abandon this StoresVec.
    if (!Diff)
      return false;
    StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
  }

  // Sort the pairs by the distance from the first store.
  stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
                                 const std::pair<StoreInst *, int> &Pair2) {
    int Offset1 = Pair1.second;
    int Offset2 = Pair2.second;
    return Offset1 < Offset2;
  });

  // Check if the stores are consecutive by checking if their difference is 1.
  for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
    if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
      return false;

  // Calculate the shuffle indices according to their offset against the
  // sorted StoreOffsetVec.
  ReorderIndices.reserve(StoresVec.size());
  for (StoreInst *SI : StoresVec) {
    unsigned Idx = find_if(StoreOffsetVec,
                           [SI](const std::pair<StoreInst *, int> &Pair) {
                             return Pair.first == SI;
                           }) -
                   StoreOffsetVec.begin();
    ReorderIndices.push_back(Idx);
  }
  // The identity order (e.g., {0, 1, 2, 3}) is treated as an empty order in
  // reorderTopToBottom() and reorderBottomToTop(), so clear it here.
  auto IsIdentityOrder = [](const OrdersType &Order) {
    for (unsigned Idx : seq<unsigned>(0, Order.size()))
      if (Idx != Order[Idx])
        return false;
    return true;
  };
  if (IsIdentityOrder(ReorderIndices))
    ReorderIndices.clear();

  return true;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif
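// Worked example (illustrative, not from the original source): for stores at
// element offsets {8, 4, 0, 12} relative to the first store, sorting gives
// {0, 4, 8, 12}; the deltas are 4, not 1, so canFormVector() fails. For
// offsets {2, 1, 0, 3} sorting gives {0, 1, 2, 3}, the stores are
// consecutive, and ReorderIndices == {2, 1, 0, 3} maps each lane to its
// position in the sorted order.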
SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();

  DenseMap<Value *, SmallVector<StoreInst *, 4>> PtrToStoresMap =
      collectUserStores(TE);

  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  SmallVector<OrdersType, 1> ExternalReorderIndices;

  // Now inspect the stores collected per pointer and look for vectorization
  // candidates. For each candidate, calculate the reorder index vector and
  // push it into `ExternalReorderIndices`.
  for (const auto &Pair : PtrToStoresMap) {
    auto &StoresVec = Pair.second;
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;
    // If the stores are not consecutive then abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;
    // The scalars in StoresVec can form a vector instruction, so record the
    // reorder indices.
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}
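// Note (illustrative): each element of the returned vector describes one
// group of NumLanes external user stores that can form a single vector
// store; an empty OrdersType means that group is already in identity order.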
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

/// Returns the first value in \p VL that needs scheduling, if it is the only
/// such value; otherwise returns nullptr.
static Value *needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!NeedsScheduling) {
      NeedsScheduling = V;
      continue;
    }
    return nullptr;
  }
  return NeedsScheduling;
}
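// Illustrative example (not from the original source): for VL == {phi, add,
// phi}, where PHI nodes do not need scheduling, the single add is returned;
// with two adds in VL, nullptr is returned and the bundle must be scheduled
// as a whole.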