#ifdef EXPENSIVE_CHECKS

using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions,
          "Number of vector instructions generated");

    "Controls which SLP graphs should be vectorized.");

    cl::desc("Run the SLP vectorization passes"));

    cl::desc("Enable vectorization for wider vector utilization"));

    cl::desc("Only vectorize if you gain more than this "

    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

    cl::desc("Attempt to vectorize horizontal reductions"));

    "Attempt to vectorize horizontal reductions feeding into a store"));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

    cl::desc("Limit the size of the SLP scheduling region per block"));

    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Limit the recursion depth when building a vectorizable tree"));

    cl::desc("Only vectorize small trees if they are fully vectorizable"));

    cl::desc("The maximum look-ahead depth for operand reordering scores"));

    cl::desc("The maximum look-ahead depth for searching best rooting option"));

    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

    cl::desc("The maximum stride, considered to be profitable."));

    cl::desc("Display the SLP trees with Graphviz"));

    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
  if (SLPReVec && isa<FixedVectorType>(Ty))
    // ...
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         // ...

  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();

  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();

                        Type *Ty, unsigned Sz) {
  if (NumParts == 0 || NumParts >= Sz)
    // ...
  if (NumParts == 0 || NumParts >= Sz)
    // ...
  return (Sz / RegVF) * RegVF;
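// Editor's note (not in the original): the expression (Sz / RegVF) * RegVF
// rounds the element count Sz down to a multiple of the number of elements
// that fit in one vector register (RegVF). For example, with Sz = 7 scalars
// and RegVF = 4, only (7 / 4) * 4 = 4 elements are treated as coverable by
// full registers.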
  for (unsigned I : seq<unsigned>(Mask.size()))
    // ...
                  I * VecTyNumElements, VecTyNumElements)))
               : Mask[I] * VecTyNumElements + J;

  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    // ...
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    // ...
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    // ...
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
    // ...
      auto *SV = cast<ShuffleVectorInst>(V);
      // ...
      if (SV->getOperand(0) != Src)
        // ...
      if (!SV->isExtractSubvectorMask(Index))
        // ...
      ExpectedIndex.set(Index / ShuffleMaskSize);
    // ...
    if (!ExpectedIndex.all())
      // ...
  }
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");

  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      // ...
            : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
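// Editor's note (not in the original): when several shufflevectors each
// extract a subvector from sources that are conceptually concatenated, their
// per-instruction mask elements M are rebased by the running offset
// AccumulateLength so they address the combined vector. For example, two
// <2 x i32> extracts with masks [0,1] and [0,1] from <4 x i32> sources become
// [0,1] and [4,5] after rebasing by SVNumElements = 4.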
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);

  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    // ...
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    // ...
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    // ...
  if (isa<ExtractElementInst>(I))
    // ...
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");

  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);

  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  auto *It = find_if(VL, IsaPred<Instruction>);
  // ...
    if (isa<PoisonValue>(V))
      // ...
    auto *II = dyn_cast<Instruction>(V);
    // ...
    if (BB != II->getParent())
      // ...

  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      // ...
    if (!FirstNonUndef) {
      // ...
    }
    if (V != FirstNonUndef)
      // ...
  }
  return FirstNonUndef != nullptr;
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            // ...
              if (match(U.getUser(),
                        m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                  (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                // ...
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::abs>(
                               m_Specific(U.get()), m_ConstantInt(Flag))) &&
                     (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                      // ...
           (BO->getOpcode() == Instruction::FSub &&
            // ...
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();
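// Editor's note (not in the original): integer `sub` and `fsub` are not
// commutative in general, but the checks above allow treating them as
// commutative when every user only cares about the magnitude of the result,
// e.g. an icmp eq/ne against zero or a call to llvm.abs/llvm.fabs, because
// swapping the operands then only flips the sign of an intermediate value.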
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                // ...
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    // ...
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    // ...
    if (CI->getValue().uge(VT->getNumElements()))
      // ...
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
  }

  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    // ...
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    // ...
  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  // ...
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
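// Editor's note (not in the original): the loop above linearizes a multi-level
// insertvalue index into one flat position by scaling with the element count
// at each aggregate level (the step that adds the current index I is elided
// in this listing). For example, inserting into field 1 of element 2 of a
// [4 x {i32, i32}] yields Index = 2 * 2 + 1 = 5.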
  if (MaskArg == UseMask::UndefsAsMask)
    // ...
  if (MaskArg == UseMask::FirstArg && Value < VF)
    UseMask.reset(Value);
  else if (MaskArg == UseMask::SecondArg && Value >= VF)
    UseMask.reset(Value - VF);

template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  // ...
  auto *C = dyn_cast<Constant>(V);
  // ...
  if (!UseMask.empty()) {
    // ...
    while (auto *II = dyn_cast<InsertElementInst>(Base)) {
      // ...
      if (isa<T>(II->getOperand(1)))
        // ...
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))
        // ...
    }
    Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      // ...
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
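// Editor's note (not in the original): isUndefVector answers, per lane,
// whether the vector V is undef (or poison when IsPoisonOnly is set). The
// optional UseMask restricts the query to the lanes that are actually read,
// so a vector built by insertelement still counts as sufficiently undef when
// all of its defined lanes lie outside the used set.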
static std::optional<TargetTransformInfo::ShuffleKind>
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  // ...
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        // ...
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        // ...
        return std::max(S, VTy->getNumElements());
      });
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
    auto *EE = dyn_cast<ExtractElementInst>(V);
    // ...
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))
      // ...
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    if (isa<UndefValue>(VL[I]))
      // ...
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      // ...
    auto *Vec = EI->getVectorOperand();
    if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
      // ...
    if (isa<UndefValue>(Vec)) {
      // ...
    }
    if (isa<UndefValue>(EI->getIndexOperand()))
      // ...
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    // ...
    unsigned IntIdx = Idx->getValue().getZExtValue();
    // ...
    if (!Vec1 || Vec1 == Vec) {
      // ...
    } else if (!Vec2 || Vec2 == Vec) {
      // ...
    }
    if (CommonShuffleMode == Permute)
      // ...
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      // ...
    }
    CommonShuffleMode = Select;
  }
  if (CommonShuffleMode == Select && Vec2)
    // ...

  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    // ...
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    // ...
  return *EI->idx_begin();
class InstructionsState {
  unsigned getAltOpcode() const {
    // ...
  }

  bool isAltShuffle() const { return AltOp != MainOp; }

    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;

  InstructionsState() = delete;
      : MainOp(MainOp), AltOp(AltOp) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
};

         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         // ...
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
          // ...
         (BasePred == SwappedPred &&
          // ...
  if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
    return InstructionsState::invalid();
  auto *It = find_if(VL, IsaPred<Instruction>);
  // ...
    return InstructionsState::invalid();
  unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
  if ((VL.size() > 2 && !isa<PHINode>(V) && InstCnt < VL.size() / 2) ||
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();

  bool IsCastOp = isa<CastInst>(V);
  bool IsBinOp = isa<BinaryOperator>(V);
  bool IsCmpOp = isa<CmpInst>(V);
  unsigned Opcode = cast<Instruction>(V)->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = std::distance(VL.begin(), It);

  bool SwappedPredsCompatible = [&]() {
    // ...
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      // ...
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();

  auto *IBase = cast<Instruction>(V);
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
    // ...
      return InstructionsState::invalid();
  }
  bool AnyPoison = InstCnt != VL.size();
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = dyn_cast<Instruction>(VL[Cnt]);
    // ...
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        // ...
      AltOpcode = InstOpcode;
      // ...
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Value *Op1 = I->getOperand(0);
      // ...
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        // ...
      if (Opcode == AltOpcode) {
        // ...
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
        // ...
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(V);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      // ...
      assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator "
             // ...
      if ((E == 2 || SwappedPredsCompatible) &&
          (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        // ...
      auto *AltInst = cast<CmpInst>(VL[AltIndex]);
      // ...
      } else if (BasePred != CurrentPred) {
        // ...
               "CmpInst isn't safe for alternation, logic needs to be updated!");
      }
      if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
          AltPred == CurrentPred || AltPred == SwappedCurrentPred)
        // ...
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
             // ...
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState::invalid();
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        // ...
          return InstructionsState::invalid();
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
        // ...
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            // ...
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
                        // ...
          return InstructionsState::invalid();
        // ...
          return InstructionsState::invalid();
        if (Mappings.size() != BaseMappings.size() ||
            Mappings.front().ISA != BaseMappings.front().ISA ||
            Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
            Mappings.front().VectorName != BaseMappings.front().VectorName ||
            Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
            Mappings.front().Shape.Parameters !=
                BaseMappings.front().Shape.Parameters)
          return InstructionsState::invalid();
      }
      // ...
      return InstructionsState::invalid();
    }
  }
  return InstructionsState(cast<Instruction>(V),
                           cast<Instruction>(VL[AltIndex]));
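// Editor's note (not in the original): getSameOpcode summarizes a bundle of
// scalars as a MainOp/AltOp pair. If every instruction shares one opcode,
// MainOp == AltOp; if exactly two compatible opcodes alternate (for example
// add and sub in a bundle like {a+b, c-d, e+f, g-h}), MainOp and AltOp differ
// and the bundle is later emitted as two vector operations combined by a
// shuffle. Anything else yields InstructionsState::invalid().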
  unsigned Opcode = UserInst->getOpcode();
  // ...
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    // ...
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    // ...
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;
  }

  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    // ...

  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  // ...
    return SI->isSimple();
  // ...
    return !MI->isVolatile();
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    // ...
         (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
          // ...
         "SubMask with many inputs support must be larger than the mask.");
    Mask.append(SubMask.begin(), SubMask.end());
    // ...
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    // ...
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      // ...
    NewMask[I] = Mask[SubMask[I]];
  }

  const unsigned Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
    // ...
    UnusedIndices.reset(Order[I]);
    // ...
    MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    // ...
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");
  Type *ScalarTy = VL[0]->getType();
  // ...
  for (unsigned Lane : seq<unsigned>(VL.size())) {
    if (isa<PoisonValue>(VL[Lane]))
      // ...
    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
  }

  const unsigned E = Indices.size();
  // ...
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
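// Editor's note (not in the original): the loop above builds the inverse
// permutation of Indices as a shuffle mask. For Indices = {2, 0, 1} it
// produces Mask = [1, 2, 0], i.e. Mask[Indices[I]] == I for every I.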
  assert(!Mask.empty() && "Expected non-empty mask.");
  // ...
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    // ...
      Scalars[Mask[I]] = Prev[I];

  auto *I = dyn_cast<Instruction>(V);
  // ...
    auto *IO = dyn_cast<Instruction>(V);
    // ...
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();

  auto *I = dyn_cast<Instruction>(V);
  // ...
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         // ...
           auto *IU = dyn_cast<Instruction>(U);
           // ...
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);

  return !VL.empty() &&
         // ...
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
namespace slpvectorizer {

  struct ScheduleData;

      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
        // ...

    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();

    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;

    const TreeEntry &Root = *VectorizableTree.front().get();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
      // ...
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;

    return MinBWs.at(VectorizableTree.front().get()).second;

    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      // ...
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
    // ...
          VectorizableTree.front()->Scalars.front()->getContext(),
          // ...
          VectorizableTree.front()->getVectorFactor());

    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MultiNodeScalars.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      // ...
    }
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();

    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
    // ...
      return P.value() == P.index() || P.value() == Sz;
    return MaxVecRegSize;

    return MinVecRegSize;

    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
        // ...
    return MaxVF ? MaxVF : UINT_MAX;

                         unsigned *BestVF = nullptr,
                         bool TryRecursiveCheck = true) const;

  template <typename T>
  // ...

      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";

        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}
    if (isa<LoadInst>(V1)) {
      // ...
      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        // ...
        auto AllUsersVectorized = [U1, U2, this](Value *V) {
          // ...
            return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
        };
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      };
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
                                      // ...
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))
        // ...
    }

    auto CheckSameEntryOrFail = [&]() {
      if (const TreeEntry *TE1 = R.getTreeEntry(V1);
          TE1 && TE1 == R.getTreeEntry(V2))
        // ...
    };

    auto *LI1 = dyn_cast<LoadInst>(V1);
    auto *LI2 = dyn_cast<LoadInst>(V2);
    // ...
      if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
          // ...
        return CheckSameEntryOrFail();
      // ...
          LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
          LI2->getPointerOperand(), DL, SE, true);
      if (!Dist || *Dist == 0) {
        // ...
            R.TTI->isLegalMaskedGather(
                // ...
        return CheckSameEntryOrFail();
      }
      // ...
      if (std::abs(*Dist) > NumLanes / 2)
        // ...

    auto *C1 = dyn_cast<Constant>(V1);
    auto *C2 = dyn_cast<Constant>(V2);
    // ...

    if (isa<UndefValue>(V2))
      // ...
    Value *EV2 = nullptr;
    // ...
      int Dist = Idx2 - Idx1;
      // ...
      if (std::abs(Dist) == 0)
        // ...
      if (std::abs(Dist) > NumLanes / 2)
        // ...
      return CheckSameEntryOrFail();

    auto *I1 = dyn_cast<Instruction>(V1);
    auto *I2 = dyn_cast<Instruction>(V2);
    // ...
      if (I1->getParent() != I2->getParent())
        return CheckSameEntryOrFail();
      // ...
      if (S.getOpcode() &&
          (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
           !S.isAltShuffle()) &&
          // ...
            return isa<PoisonValue>(V) ||
                   cast<Instruction>(V)->getNumOperands() ==
                       S.getMainOp()->getNumOperands();
          // ...

    if (I1 && isa<PoisonValue>(V2))
      // ...

    if (isa<UndefValue>(V2))
      // ...

    return CheckSameEntryOrFail();
    int ShallowScoreAtThisLevel =
        // ...
    auto *I1 = dyn_cast<Instruction>(LHS);
    auto *I2 = dyn_cast<Instruction>(RHS);
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        // ...
        (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
          (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
          (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
         ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");

    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      // ...
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
      // ...
                             ? I2->getNumOperands()
                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        // ...
        if (Op2Used.count(OpIdx2))
          // ...
                                I1, I2, CurrLevel + 1, {});
        // ...
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
          // ...
        }
      }
      // ...
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
    }
    return ShallowScoreAtThisLevel;
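// Editor's note (not in the original): the look-ahead score is computed
// recursively. Two candidate operands first get a "shallow" score (e.g.
// consecutive loads score higher than unrelated values); the recursion then
// descends up to MaxLevel into their operands, greedily pairing each operand
// of I1 with the best-scoring unused operand of I2 and accumulating the
// result into ShallowScoreAtThisLevel.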
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      // ...
      bool IsUsed = false;
    };

    enum class ReorderingMode {
      // ...
    };

    unsigned ArgSize = 0;
    // ...
    const Loop *L = nullptr;

    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }

      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
           OpIdx != NumOperands; ++OpIdx)
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
             // ...
          OpsVec[OpIdx][Lane].IsUsed = false;
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    }

    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
                      // ...
      Value *IdxLaneV = getData(Idx, Lane).V;
      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
          isa<ExtractElementInst>(IdxLaneV))
        // ...
      for (unsigned Ln : seq<unsigned>(getNumLanes())) {
        // ...
        Value *OpIdxLnV = getData(OpIdx, Ln).V;
        if (!isa<Instruction>(OpIdxLnV))
          // ...
      }
      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      auto OpIdxIt = Uniques.find(OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        // ...
      return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                          UniquesCntWithOpIdxLaneV,
                      UniquesCntWithOpIdxLaneV -
                          // ...
             ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                  ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                  : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);

    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      // ...
      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
        // ...
      return R.areAllUsersVectorized(IdxLaneI)
             // ...
    }

    static const int ScoreScaleFactor = 10;

                          int Lane, unsigned OpIdx, unsigned Idx,
                          // ...
      int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
      if (Score <= -SplatScore) {
        // ...
      }
      Score += SplatScore;
      // ...
      Score *= ScoreScaleFactor;
      Score += getExternalUseScore(Lane, OpIdx, Idx);
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                   // ...
      unsigned NumOperands = getNumOperands();
      // ...
      Value *OpLastLane = getData(OpIdx, LastLane).V;
      // ...
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;
      // ...
      bool OpIdxAPO = getData(OpIdx, Lane).APO;
      // ...
        std::optional<unsigned> Idx;
        // ...
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
              // ...
      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        // ...
        OperandData &OpData = getData(Idx, Lane);
        // ...
        bool OpAPO = OpData.APO;
        // ...
        if (OpAPO != OpIdxAPO)
          // ...
        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed, UsedLanes);
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
               // ...
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
          }
          // ...
        }
        case ReorderingMode::Constant:
          if (isa<Constant>(Op) ||
              (!BestOp.Score && L && L->isLoopInvariant(Op))) {
            // ...
            if (isa<Constant>(Op)) {
              // ...
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                  // ...
            }
            // ...
          }
          if (isa<UndefValue>(Op) || !isa<Constant>(Op))
            // ...
        case ReorderingMode::Splat:
          if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
            IsUsed = Op == OpLastLane;
            if (Op == OpLastLane) {
              // ...
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                  // ...
            }
          }
          // ...
        case ReorderingMode::Failed:
          // ...
      }
      // ...
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
        // ...
      return std::nullopt;
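// Editor's note (not in the original): getBestOperand picks, for a given
// operand position and lane, the candidate that fits the operand's reordering
// mode best: for Load/Opcode modes it maximizes the look-ahead score against
// the value chosen in the previous lane, for Constant it prefers real
// constants (or loop-invariant values as a fallback), and for Splat it
// prefers repeating the exact value from the last lane. Candidates whose APO
// flag differs are never mixed, which keeps operands of non-commutative
// operations on their original side.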
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      // ...
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        // ...
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          // ...
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          // ...
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto [It, Inserted] =
              HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
          // ...
        }
      }
      // ...
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
      // ...
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
      // ...
    }

    struct OperandsOrderData {
      // ...
      unsigned NumOfAPOs = UINT_MAX;
      // ...
      unsigned NumOpsWithSameOpcodeParent = 0;
      // ...
    };

    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      // ...
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
      // ...
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        const OperandData &OpData = getData(OpIdx, Lane);
        // ...
        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
          // ...
              I->getParent() != Parent) {
            if (NumOpsWithSameOpcodeParent == 0) {
              NumOpsWithSameOpcodeParent = 1;
              // ...
            }
            Parent = I->getParent();
            // ...
            --NumOpsWithSameOpcodeParent;
          }
          // ...
          ++NumOpsWithSameOpcodeParent;
        }
        // ...
            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
      }
      // ...
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      assert((empty() || VL.size() == getNumLanes()) &&
             "Expected same number of lanes");
      // ...
      constexpr unsigned IntrinsicNumOperands = 2;
      // ...
      ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
      OpsVec.resize(NumOperands);
      unsigned NumLanes = VL.size();
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        OpsVec[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
                 "Expected instruction or poison value");
          // ...
          if (isa<PoisonValue>(VL[Lane])) {
            OpsVec[OpIdx][Lane] = {
                // ...
          }
          bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
                                 // ...
        }
      }

    unsigned getNumOperands() const { return ArgSize; }

    unsigned getNumLanes() const { return OpsVec[0].size(); }

    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    }

    bool empty() const { return OpsVec.empty(); }

    void clear() { OpsVec.clear(); }
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      // ...
      if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
        // ...
      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(Op);
      // ...
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        // ...
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
            // ...
          Value *OpILane = getValue(OpI, Lane);
          bool IsConstantOp = isa<Constant>(OpILane);
          // ...
              ((Lns > 2 && isa<Constant>(Data.V)) ||
               // ...
                isa<Constant>(Data.V)))) ||
              // ...
              (IsInvariant && !isa<Constant>(Data.V) &&
               // ...
               L->isLoopInvariant(Data.V))) {
            FoundCandidate = true;
            // ...
          }
        }
        if (!FoundCandidate)
          // ...
      }
      return getNumLanes() == 2 || Cnt > 1;
    }

    bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      bool OpAPO = getData(OpIdx, Lane).APO;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        // ...
        if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
              const OperandData &Data = getData(OpI, Ln);
              if (Data.APO != OpAPO || Data.IsUsed)
                // ...
              Value *OpILn = getValue(OpI, Ln);
              return (L && L->isLoopInvariant(OpILn)) ||
                     // ...
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
          L(R.LI->getLoopFor((VL0->getParent()))) {
      // ...
      appendOperandsOfVL(RootVL, VL0);
    }

      assert(OpsVec[OpIdx].size() == getNumLanes() &&
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      // ...
      unsigned FirstLane = getBestLaneToStartReordering();
      // ...
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);
        // ...
        if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
          // ...
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
              !canBeVectorized(OpILane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else if (isa<LoadInst>(OpILane0))
            ReorderingModes[OpIdx] = ReorderingMode::Load;
          else
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(OpLane0)) {
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        } else if (isa<Argument>(OpLane0)) {
          // ...
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        }
        // ...
      }

      auto &&SkipReordering = [this]() {
        // ...
        for (const OperandData &Data : Op0)
          // ...
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
                // ...
      };

      if (SkipReordering())
        // ...

      bool StrategyFailed = false;
      // ...
      for (unsigned I = 0; I < NumOperands; ++I)
        MainAltOps[I].push_back(getData(I, FirstLane).V);

      UsedLanes.set(FirstLane);
      for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
        // ...
        int Lane = FirstLane + Direction * Distance;
        if (Lane < 0 || Lane >= (int)NumLanes)
          // ...
        UsedLanes.set(Lane);
        // ...
        assert(LastLane >= 0 && LastLane < (int)NumLanes &&
               // ...
        for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
          // ...
          std::optional<unsigned> BestIdx =
              getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                             MainAltOps[OpIdx], UsedLanes);
          // ...
            swap(OpIdx, *BestIdx, Lane);
          // ...
            StrategyFailed = true;
          // ...
          if (MainAltOps[OpIdx].size() != 2) {
            OperandData &AltOp = getData(OpIdx, Lane);
            InstructionsState OpS =
                // ...
            if (OpS.getOpcode() && OpS.isAltShuffle())
              // ...
          }
        }
      }
      // ...
      if (!StrategyFailed)
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      // ...
      case ReorderingMode::Load:
        // ...
      case ReorderingMode::Opcode:
        // ...
      case ReorderingMode::Constant:
        // ...
      case ReorderingMode::Splat:
        // ...
      case ReorderingMode::Failed:
        // ...

      const unsigned Indent = 2;
      // ...
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          // ...
          if (Value *V = OpData.V)
            // ...
          OS << ", APO:" << OpData.APO << "}\n";
        }
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
      // ...
                         Candidates[I].second,
                         // ...
      if (Score > BestScore) {
    DeletedInstructions.insert(I);

  template <typename T>
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      DeletedInstructions.insert(I);
    }
    // ...
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
        // ...
      auto *I = cast<Instruction>(V);
      // ...
      if (const TreeEntry *Entry = getTreeEntry(I)) {
        Entries.push_back(Entry);
        auto It = MultiNodeScalars.find(I);
        if (It != MultiNodeScalars.end())
          Entries.append(It->second.begin(), It->second.end());
      }
      for (Use &U : I->operands()) {
        if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            // ...
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
             })))
          // ...
      }
      I->dropAllReferences();
    }
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      if (!I->getParent())
        // ...
                 cast<Instruction>(U.getUser()));
             // ...
             "trying to erase instruction with users.");
      I->removeFromParent();
    }
    // ...
    while (!DeadInsts.empty()) {
      // ...
      if (!VI || !VI->getParent())
        // ...
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");
      // ...
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        // ...
        if (auto *OpI = dyn_cast<Instruction>(OpV))
          if (!DeletedInstructions.contains(OpI) &&
              // ...
      }
      VI->removeFromParent();
      DeletedInstructions.insert(VI);
    }

    return AnalyzedReductionsRoots.count(I);

    AnalyzedReductionsRoots.insert(I);

    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();

    return NonScheduledFirst.contains(V);
2936 bool collectValuesToDemote(
2937 const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
2940 bool &IsProfitableToDemote,
bool IsTruncRoot)
const;
2950 canReorderOperands(TreeEntry *UserTE,
2957 void reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const;
2961 TreeEntry *getVectorizedOperand(TreeEntry *UserTE,
unsigned OpIdx) {
2963 TreeEntry *TE =
nullptr;
2965 TE = getTreeEntry(V);
2966 if (TE &&
is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2968 auto It = MultiNodeScalars.find(V);
2969 if (It != MultiNodeScalars.end()) {
2970 for (TreeEntry *E : It->second) {
2971 if (
is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2979 if (It != VL.
end()) {
2980 assert(
TE->isSame(VL) &&
"Expected same scalars.");
2988 const TreeEntry *getVectorizedOperand(
const TreeEntry *UserTE,
2989 unsigned OpIdx)
const {
2990 return const_cast<BoUpSLP *
>(
this)->getVectorizedOperand(
2991 const_cast<TreeEntry *
>(UserTE), OpIdx);
2995 bool areAllUsersVectorized(
3004 const TreeEntry *getOperandEntry(
const TreeEntry *E,
unsigned Idx)
const;
3009 Instruction *getRootEntryInstruction(
const TreeEntry &Entry)
const;
3013 getCastContextHint(
const TreeEntry &TE)
const;
3022 const EdgeInfo &EI,
unsigned InterleaveFactor = 0);
3033 bool ResizeAllowed =
false)
const;
3042 TreeEntry *getMatchedVectorizedOperand(
const TreeEntry *E,
unsigned NodeIdx);
3043 const TreeEntry *getMatchedVectorizedOperand(
const TreeEntry *E,
3044 unsigned NodeIdx)
const {
3045 return const_cast<BoUpSLP *
>(
this)->getMatchedVectorizedOperand(E, NodeIdx);
3052 Value *vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
bool PostponedPHIs);
3057 template <
typename BVTy,
typename ResTy,
typename...
Args>
3058 ResTy processBuildVector(
const TreeEntry *E,
Type *ScalarTy, Args &...Params);
3063 Value *createBuildVector(
const TreeEntry *E,
Type *ScalarTy,
3064 bool PostponedPHIs);
3070 Instruction &getLastInstructionInBundle(
const TreeEntry *E);
3077 std::optional<TargetTransformInfo::ShuffleKind>
3089 unsigned NumParts)
const;
3101 std::optional<TargetTransformInfo::ShuffleKind>
3102 isGatherShuffledSingleRegisterEntry(
3119 isGatherShuffledEntry(
3122 unsigned NumParts,
bool ForOrder =
false);
3128 Type *ScalarTy)
const;
3132 void setInsertPointAfterBundle(
const TreeEntry *E);
3142 bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
3147 void tryToVectorizeGatheredLoads(
3156 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
3172 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
3176 void reorderGatherNode(TreeEntry &TE);
3180 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3197 [Scalars](
Value *V,
int Idx) {
3198 return (isa<UndefValue>(V) &&
3199 Idx == PoisonMaskElem) ||
3200 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3203 if (!ReorderIndices.empty()) {
3210 return IsSame(Scalars, Mask);
3211 if (VL.
size() == ReuseShuffleIndices.size()) {
3213 return IsSame(Scalars, Mask);
3217 return IsSame(Scalars, ReuseShuffleIndices);
3220 bool isOperandGatherNode(
const EdgeInfo &UserEI)
const {
3221 return isGather() && !UserTreeIndices.empty() &&
3222 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3223 UserTreeIndices.front().UserTE == UserEI.UserTE;
3227 bool hasEqualOperands(
const TreeEntry &TE)
const {
3228 if (
TE.getNumOperands() != getNumOperands())
3231 for (
unsigned I = 0, E = getNumOperands();
I < E; ++
I) {
3232 unsigned PrevCount =
Used.count();
3233 for (
unsigned K = 0;
K < E; ++
K) {
3236 if (getOperand(K) ==
TE.getOperand(
I)) {
3242 if (PrevCount ==
Used.count())
3251 unsigned getVectorFactor()
const {
3252 if (!ReuseShuffleIndices.empty())
3253 return ReuseShuffleIndices.size();
3254 return Scalars.
size();
3258 bool isGather()
const {
return State == NeedToGather; }
3285 enum CombinedOpcode {
3287 MinMax = Instruction::OtherOpsEnd + 1,
3289 CombinedOpcode CombinedOp = NotCombinedOp;
3303 VecTreeTy &Container;
3327 unsigned InterleaveFactor = 0;
3331 unsigned getInterleaveFactor()
const {
return InterleaveFactor; }
3333 void setInterleave(
unsigned Factor) { InterleaveFactor = Factor; }
3339 assert(Operands[OpIdx].empty() &&
"Already resized?");
3341 "Number of operands is greater than the number of scalars.");
3347 void setOperand(
const BoUpSLP &R,
bool RequireReorder =
false) {
3348 VLOperands Ops(Scalars, MainOp, R);
3352 setOperand(
I, Ops.getVL(
I));
3374 unsigned getNumOperands()
const {
return Operands.size(); }
3377 Value *getSingleOperand(
unsigned OpIdx)
const {
3379 assert(!Operands[OpIdx].empty() &&
"No operand available");
3384 bool isAltShuffle()
const {
return MainOp != AltOp; }
3387 unsigned CheckedOpcode =
I->getOpcode();
3388 return (getOpcode() == CheckedOpcode ||
3389 getAltOpcode() == CheckedOpcode);
3396 auto *
I = dyn_cast<Instruction>(
Op);
3397 if (
I && isOpcodeOrAlt(
I))
3402 void setOperations(
const InstructionsState &S) {
3403 MainOp = S.getMainOp();
3404 AltOp = S.getAltOp();
3416 unsigned getOpcode()
const {
3417 return MainOp ? MainOp->
getOpcode() : 0;
3420 unsigned getAltOpcode()
const {
3426 int findLaneForValue(
Value *V)
const {
3427 unsigned FoundLane = getVectorFactor();
3428 for (
auto *It =
find(Scalars, V), *
End = Scalars.end(); It !=
End;
3429 std::advance(It, 1)) {
3432 FoundLane = std::distance(Scalars.begin(), It);
3433 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
3434 if (!ReorderIndices.
empty())
3435 FoundLane = ReorderIndices[FoundLane];
3436 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
3437 if (ReuseShuffleIndices.
empty())
3439 if (
auto *RIt =
find(ReuseShuffleIndices, FoundLane);
3440 RIt != ReuseShuffleIndices.
end()) {
3441 FoundLane = std::distance(ReuseShuffleIndices.
begin(), RIt);
3445 assert(FoundLane < getVectorFactor() &&
"Unable to find given value.");
3458 bool isNonPowOf2Vec()
const {
3460 return IsNonPowerOf2;
3469 assert((!IsNonPowerOf2 || ReuseShuffleIndices.
empty()) &&
3470 "Reshuffling not supported with non-power-of-2 vectors yet.");
3471 return IsNonPowerOf2;
3474 Value *getOrdered(
unsigned Idx)
const {
3475 assert(
isGather() &&
"Must be used only for buildvectors/gathers.");
3476 if (ReorderIndices.
empty())
3477 return Scalars[
Idx];
3487 for (
unsigned OpI = 0, OpE =
Operands.size(); OpI != OpE; ++OpI) {
3488 dbgs() <<
"Operand " << OpI <<
":\n";
3489 for (
const Value *V : Operands[OpI])
3492 dbgs() <<
"Scalars: \n";
3493 for (
Value *V : Scalars)
3495 dbgs() <<
"State: ";
3498 if (InterleaveFactor > 0) {
3499 dbgs() <<
"Vectorize with interleave factor " << InterleaveFactor
3502 dbgs() <<
"Vectorize\n";
3505 case ScatterVectorize:
3506 dbgs() <<
"ScatterVectorize\n";
3508 case StridedVectorize:
3509 dbgs() <<
"StridedVectorize\n";
3512 dbgs() <<
"NeedToGather\n";
3514 case CombinedVectorize:
3515 dbgs() <<
"CombinedVectorize\n";
3518 dbgs() <<
"MainOp: ";
3520 dbgs() << *MainOp <<
"\n";
3523 dbgs() <<
"AltOp: ";
3525 dbgs() << *AltOp <<
"\n";
3528 dbgs() <<
"VectorizedValue: ";
3529 if (VectorizedValue)
3530 dbgs() << *VectorizedValue <<
"\n";
3533 dbgs() <<
"ReuseShuffleIndices: ";
3534 if (ReuseShuffleIndices.
empty())
3537 for (
int ReuseIdx : ReuseShuffleIndices)
3538 dbgs() << ReuseIdx <<
", ";
3540 dbgs() <<
"ReorderIndices: ";
3541 for (
unsigned ReorderIdx : ReorderIndices)
3542 dbgs() << ReorderIdx <<
", ";
3544 dbgs() <<
"UserTreeIndices: ";
3545 for (
const auto &EInfo : UserTreeIndices)
3546 dbgs() << EInfo <<
", ";
3553 void dumpTreeCosts(
const TreeEntry *E,
InstructionCost ReuseShuffleCost,
3556 dbgs() <<
"SLP: " << Banner <<
":\n";
3558 dbgs() <<
"SLP: Costs:\n";
3559 dbgs() <<
"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<
"\n";
3560 dbgs() <<
"SLP: VectorCost = " << VecCost <<
"\n";
3561 dbgs() <<
"SLP: ScalarCost = " << ScalarCost <<
"\n";
3562 dbgs() <<
"SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3563 << ReuseShuffleCost + VecCost - ScalarCost <<
"\n";
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          // ...
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);
    // ...
  }

                          TreeEntry::EntryState EntryState,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          // ...
    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
           "Need to vectorize gather entry?");
    // ...
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
      // ...
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    // ...
            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      // ...
      Last->setOperations(S);
    } else {
      // ...
      Last->Scalars.assign(VL.size(), nullptr);
      // ...
        if (Idx >= VL.size())
          return UndefValue::get(VL.front()->getType());
        // ...
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    }
    if (!Last->isGather()) {
      for (Value *V : VL) {
        // ...
        const TreeEntry *TE = getTreeEntry(V);
        // ...
               "Scalar already in tree!");
        // ...
          MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
        // ...
        ScalarToTreeEntry[V] = Last;
      }
      // ...
      ScheduleData *BundleMember = *Bundle;
      assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
              // ...
             "Bundle and VL out of sync");
      // ...
      for (Value *V : VL) {
        // ...
        BundleMember->TE = Last;
        BundleMember = BundleMember->NextInBundle;
      }
      assert(!BundleMember && "Bundle and VL out of sync");
    } else {
      // ...
      bool AllConstsOrCasts = true;
      // ...
        auto *I = dyn_cast<CastInst>(V);
        AllConstsOrCasts &= I && I->getType()->isIntegerTy();
        if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
            !UserTreeIdx.UserTE->isGather())
          // ...
      if (AllConstsOrCasts)
        // ...
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert(VL.begin(), VL.end());
    }
    if (UserTreeIdx.UserTE)
      Last->UserTreeIndices.push_back(UserTreeIdx);
3681 TreeEntry::VecTreeTy VectorizableTree;
3686 for (
unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3687 VectorizableTree[
Id]->dump();
3693 TreeEntry *getTreeEntry(
Value *V) {
return ScalarToTreeEntry.lookup(V); }
3695 const TreeEntry *getTreeEntry(
Value *V)
const {
3696 return ScalarToTreeEntry.lookup(V);
3705 bool areAltOperandsProfitable(
const InstructionsState &S,
3710 TreeEntry::EntryState
3712 bool IsScatterVectorizeUserTE,
3745 using ValueToGatherNodesMap =
3747 ValueToGatherNodesMap ValueToGatherNodes;
3755 bool IsGraphTransformMode =
false;
3758 std::optional<unsigned> GatheredLoadsEntriesFirst;
3761 struct ExternalUser {
3785 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
3786 auto It = AliasCache.
find(Key);
3787 if (It != AliasCache.
end())
3792 AliasCache.
try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3796 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3828 UserList ExternalUses;
  struct ScheduleData {
    // ...
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      // ...

      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }
      // ...
        assert(isSchedulingEntity() &&
               "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");
        }
      // ...
      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");

    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    bool isSchedulingEntity() const { return FirstInBundle == this; }

    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;
    }

    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;
    }

    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();
    }

    void resetUnscheduledDeps() {
      UnscheduledDeps = Dependencies;
    }

    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      // ...
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
          // ...
        Sum += BundleMember->UnscheduledDeps;
      }
      // ...
    }

      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        // ...
        ScheduleData *SD = NextInBundle;
        // ...
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        // ...
      }

    TreeEntry *TE = nullptr;

    ScheduleData *FirstInBundle = nullptr;

    ScheduleData *NextInBundle = nullptr;

    ScheduleData *NextLoadStore = nullptr;

    int SchedulingRegionID = 0;

    int SchedulingPriority = 0;

    int Dependencies = InvalidDeps;

    int UnscheduledDeps = InvalidDeps;

    bool IsScheduled = false;
  };

                             const BoUpSLP::ScheduleData &SD) {
4047 struct BlockScheduling {
4049 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
4053 ScheduleStart =
nullptr;
4054 ScheduleEnd =
nullptr;
4055 FirstLoadStoreInRegion =
nullptr;
4056 LastLoadStoreInRegion =
nullptr;
4057 RegionHasStackSave =
false;
4061 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4064 ScheduleRegionSize = 0;
4068 ++SchedulingRegionID;
4072 if (BB !=
I->getParent())
4075 ScheduleData *SD = ScheduleDataMap.lookup(
I);
4076 if (SD && isInSchedulingRegion(SD))
4081 ScheduleData *getScheduleData(
Value *V) {
4082 if (
auto *
I = dyn_cast<Instruction>(V))
4083 return getScheduleData(
I);
4087 bool isInSchedulingRegion(ScheduleData *SD)
const {
4088 return SD->SchedulingRegionID == SchedulingRegionID;
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      // ...
      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        // ...
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          ScheduleData *OpDef = getScheduleData(I);
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // ...
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            // ...
                << "SLP: gets ready (def): " << *DepBundle << "\n");
          }
        };

        if (TreeEntry *TE = BundleMember->TE) {
          // ...
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, BundleMember->Inst));
          assert(Lane >= 0 && "Lane not set");
          // ...
          auto *In = BundleMember->Inst;
          // ...
                 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
                  In->getNumOperands() == TE->getNumOperands()) &&
                 "Missed TreeEntry operands?");
          // ...
          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              // ...
        } else {
          // ...
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
              // ...
        }
        // ...
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // ...
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            // ...
                << "SLP: gets ready (mem): " << *DepBundle << "\n");
          }
        }
        // ...
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            // ...
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            // ...
                << "SLP: gets ready (ctl): " << *DepBundle << "\n");
          }
        }
      }
    }
4189 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
4190 ScheduleStart->comesBefore(ScheduleEnd) &&
4191 "Not a valid scheduling region?");
4193 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
4194 auto *SD = getScheduleData(
I);
4197 assert(isInSchedulingRegion(SD) &&
4198 "primary schedule data not in window?");
4199 assert(isInSchedulingRegion(SD->FirstInBundle) &&
4200 "entire bundle in window!");
4204 for (
auto *SD : ReadyInsts) {
4205 assert(SD->isSchedulingEntity() && SD->isReady() &&
4206 "item in ready list not ready?");
4212 template <
typename ReadyListType>
4213 void initialFillReadyList(ReadyListType &ReadyList) {
4214 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
4215 ScheduleData *SD = getScheduleData(
I);
4216 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
4218 ReadyList.insert(SD);
4220 <<
"SLP: initially in ready list: " << *SD <<
"\n");
4234 std::optional<ScheduleData *>
4236 const InstructionsState &S);
4242 ScheduleData *allocateScheduleDataChunks();
4246 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
4251 ScheduleData *PrevLoadStore,
4252 ScheduleData *NextLoadStore);
4256 void calculateDependencies(ScheduleData *SD,
bool InsertInReadyList,
4260 void resetSchedule();
4290 ScheduleData *FirstLoadStoreInRegion =
nullptr;
4294 ScheduleData *LastLoadStoreInRegion =
nullptr;
4299 bool RegionHasStackSave =
false;
4302 int ScheduleRegionSize = 0;
4311 int SchedulingRegionID = 1;
4319 void scheduleBlock(BlockScheduling *BS);
4326 struct OrdersTypeDenseMapInfo {
4339 static unsigned getHashValue(
const OrdersType &V) {
4360 unsigned MaxVecRegSize;
4361 unsigned MinVecRegSize;
4376 unsigned ReductionBitWidth = 0;
4379 unsigned BaseGraphSize = 1;
4383 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4402 struct ChildIteratorType
4404 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4415 return R.VectorizableTree[0].get();
4419 return {
N->UserTreeIndices.begin(),
N->Container};
4423 return {
N->UserTreeIndices.end(),
N->Container};
4428 class nodes_iterator {
4439 bool operator!=(
const nodes_iterator &N2)
const {
return N2.It != It; }
4443 return nodes_iterator(R->VectorizableTree.begin());
4447 return nodes_iterator(R->VectorizableTree.end());
4450 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
4461 OS << Entry->Idx <<
".\n";
4464 for (
auto *V : Entry->Scalars) {
4466 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
4467 return EU.Scalar == V;
4477 if (Entry->isGather())
4479 if (Entry->State == TreeEntry::ScatterVectorize ||
4480 Entry->State == TreeEntry::StridedVectorize)
4481 return "color=blue";
4490 for (
auto *
I : DeletedInstructions) {
4491 if (!
I->getParent()) {
4494 if (isa<PHINode>(
I))
4496 I->insertBefore(
F->getEntryBlock(),
4497 F->getEntryBlock().getFirstNonPHIIt());
4499 I->insertBefore(
F->getEntryBlock().getTerminator());
4502 for (
Use &U :
I->operands()) {
4503 auto *
Op = dyn_cast<Instruction>(U.get());
4504 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
4508 I->dropAllReferences();
4510 for (
auto *
I : DeletedInstructions) {
4512 "trying to erase instruction with users.");
4513 I->eraseFromParent();
4519#ifdef EXPENSIVE_CHECKS
4530 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
4531 "Expected non-empty mask.");
4534 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
4536 Reuses[Mask[
I]] = Prev[
I];
/// Reorders the given \p Order according to the given \p Mask.
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    // ...
    if (Order.empty()) {
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    // ...
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    // The order is dropped again if it turns out to be an identity
    // permutation, i.e. every constrained lane maps to its own index.
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    // ...
  }
  // ...
  if (Order.empty()) {
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    // ...
  }
  // ...
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
}
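// Illustrative sketch, not part of the original source: the final loop above
// builds the inverse permutation of the mask, recording for each source lane
// the position it ends up in. invertMask is a hypothetical standalone
// equivalent; for MaskOrder = {2, 0, 1} it yields Order = {1, 2, 0}.
static SmallVector<unsigned> invertMask(ArrayRef<int> MaskOrder) {
  // Initialize with the "unconstrained" sentinel (== number of lanes).
  SmallVector<unsigned> Order(MaskOrder.size(), MaskOrder.size());
  for (unsigned I = 0, E = MaskOrder.size(); I < E; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  return Order;
}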
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  // ...
  Type *ScalarTy = GatheredScalars.front()->getType();
  int NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  // ...
  if (NumParts == 0 || NumParts >= NumScalars ||
      VecTy->getNumElements() % NumParts != 0 ||
      // ...
      VecTy->getNumElements() / NumParts))
    NumParts = 1;
  // ...
  tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  // ...
  isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                        /*ForOrder=*/true);
  // No shuffled gathers, ignore.
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      // ...
      Entries.front().front()->isSame(TE.Scalars)) {
    // Perfect match in the graph, will reuse the previously vectorized node.
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    // ...
    return all_of(Mask, [&](int I) {
      // ...
    });
  };
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  // Builds an order for every part of the mask. A part that needs elements
  // from more than one source vector cannot be ordered: its slice of
  // CurrentOrder is reset to the NumScalars sentinel and the part is recorded
  // in ShuffledSubMasks.
  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                  ArrayRef<int> Mask, int PartSz, int NumParts,
                                  function_ref<unsigned(unsigned)> GetVF) {
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
        continue;
      const int VF = GetVF(I);
      // ...
      // Shuffle of at least 2 vectors - ignore.
      if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      // Try to include as many elements from the mask as possible.
      int FirstMin = INT_MAX;
      bool SecondVecFound = false;
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem) {
          Value *V = GatheredScalars[I * PartSz + K];
          // ...
          SecondVecFound = true;
          // ...
        }
        // ...
        SecondVecFound = true;
        // ...
      }
      FirstMin = (FirstMin / PartSz) * PartSz;
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        // ...
        if (Idx >= PartSz) {
          SecondVecFound = true;
          // ...
        }
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      }
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
    }
  };
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          unsigned VF = 0;
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            // ...
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            // ...
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  // Check the special corner case - single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
        return 0U;
      return std::max(Entries[I].front()->getVectorFactor(),
                      Entries[I].back()->getVectorFactor());
    });
  int NumUndefs =
      count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
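// Note on the convention used above (added for clarity): an order entry equal
// to NumScalars acts as an "unconstrained lane" sentinel. For example, with
// NumScalars = 4, CurrentOrder = {4, 2, 1, 4} pins lanes 1 and 2 to positions
// 2 and 1 and leaves lanes 0 and 3 free; such sentinel entries are exactly
// what NumUndefs counts before giving up on mostly-undefined orders.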
/// Checks whether two pointers can form a compatible (strided or consecutive)
/// pair for vectorization purposes.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  // ...
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
               .getOpcode()));
}
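// In short (summary note, not from the original source): two pointers are
// compatible when each of them is either not a GEP at all or a simple
// two-operand GEP, and either both GEP indices are constants or, when
// CompareOpcodes is set, the two index expressions share a common opcode.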
/// Calculates the minimal alignment common to all values in \p VL.
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}

/// Check if \p Order represents reverse order.
static bool isReverseOrder(ArrayRef<unsigned> Order) {
  assert(!Order.empty() &&
         "Order is empty. Please check it before using isReverseOrder.");
  unsigned Sz = Order.size();
  return all_of(enumerate(Order), [&](const auto &Pair) {
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
  });
}
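// Worked example (added note): with the sentinel convention, both
// Order = {3, 2, 1, 0} and Order = {4, 2, 1, 0} (lane 0 unconstrained,
// Sz == 4) are treated as reverse orders, because every constrained entry
// satisfies Sz - Index - 1 == Value.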
/// If the pointers in \p PointerOps differ by a common run-time stride,
/// returns that stride and fills \p SortedIndices with the order of the
/// pointers; otherwise returns std::nullopt.
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                  const DataLayout &DL, ScalarEvolution &SE,
                  SmallVectorImpl<unsigned> &SortedIndices,
                  Instruction *Inst = nullptr) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find the pointers with the lowest and the highest SCEV.
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return std::nullopt;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
    if (Diff->isNonConstantNegative()) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
    if (Diff1->isNonConstantNegative()) {
      PtrSCEVHighest = PtrSCEV;
      continue;
    }
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest.
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  // Peels the common multiplier out of a distance SCEV.
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return nullptr;
  };
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    // ...
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return std::nullopt;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  // Check that every distance from the lowest pointer is a unique, in-range
  // multiple of the stride; remember the order of the pointers by offset.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
      // ... (additional validity check)
        return std::nullopt;
      Dist = SC->getAPInt().getZExtValue();
    }
    // ... (reject distances that are not valid in-range multiples)
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return std::nullopt;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // ...
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  // ...
}
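// Worked example (added note): TryGetStride factors the common run-time
// stride out of a distance SCEV. If Dist is (%stride * 3) and Multiplier is
// %stride, it returns the SCEV 3; if Dist equals %stride itself, the
// Multiplier == Dist case yields the coefficient 1. A non-constant or
// duplicated coefficient makes the candidate set non-strided, and the
// function bails out with std::nullopt.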
static std::pair<InstructionCost, InstructionCost>
// ...

  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
  int NumSubElts;
  if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
                             Mask, NumSrcElts, NumSubElts, Index)) {
    if (Index + NumSubElts > NumSrcElts &&
        Index + NumSrcElts <= static_cast<int>(Mask.size()))
BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                           SmallVectorImpl<unsigned> &Order,
                           SmallVectorImpl<Value *> &PointerOps,
                           unsigned *BestVF, bool TryRecursiveCheck) const {
  // ...
  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;
  // ...
  const unsigned Sz = VL.size();
  // ...
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }
  // ...
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  // ...
  if (Order.empty()) {
    Ptr0 = PointerOps.front();
    PtrN = PointerOps.back();