#ifdef EXPENSIVE_CHECKS
// ...

using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions,
          "Number of vector instructions generated");

static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

static cl::opt<bool>
    SLPReVec("slp-revec", cl::init(false), cl::Hidden,
             cl::desc("Enable vectorization for wider vector utilization"));

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
    "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

static cl::opt<bool>
    ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                       cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<bool> AllowHorRdxIdenityOptimization(
    "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
    cl::desc("Allow optimization of original scalar identity operations on "
             "matched horizontal reductions."));

static cl::opt<int>
    MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
                           cl::desc(
                               "Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
    MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
                cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

static cl::opt<int>
    ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000),
                             cl::Hidden,
                             cl::desc(
                                 "Limit the size of the SLP scheduling region per block"));

static cl::opt<int>
    MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden,
                           cl::desc(
                               "Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
    RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden,
                      cl::desc(
                          "Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned>
    MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden,
                cl::desc(
                    "Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when it probing among candidates for vectorization tree roots.
static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc(
        "The maximum look-ahead depth for searching best rooting option"));

static cl::opt<unsigned> MinProfitableStridedLoads(
    "slp-min-strided-loads", cl::init(2), cl::Hidden,
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

static cl::opt<unsigned> MaxProfitableLoadStride(
    "slp-max-stride", cl::init(8), cl::Hidden,
    cl::desc("The maximum stride, considered to be profitable."));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
/// \returns true if the type \p Ty is a valid element type for vectorization.
static bool isValidElementType(Type *Ty) {
  // TODO: Support ScalableVectorType.
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// \returns the number of elements of type \p Ty, not less than 1.
static unsigned getNumElements(Type *Ty) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
  return 1;
}
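// For example, getNumElements(<4 x i32>) == 4 while getNumElements(i32) == 1,
// so callers can treat scalars and fixed vectors uniformly when computing the
// vectorization factor.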
// Expand a scalar-level shuffle mask into an element-level mask that
// shufflevector can consume directly: each scalar "lane" covers
// VecTyNumElements vector elements.
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                        : Mask[I] * VecTyNumElements + J;
  Mask.swap(NewMask);
/// \returns the number of groups of shufflevector instructions in \p VL, or 0
/// if \p VL is not a grouped extract-subvector pattern.
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
    ArrayRef<Value *> Group = VL.slice(I, GroupSize);
    if (!all_of(Group, [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(V);
          // All members of the group must extract from the same source.
          if (SV->getOperand(0) != Src)
            return false;
          int Index;
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          for (int I : seq<int>(Index, Index + SV->getShuffleMask().size())) {
            // ... (verify the extracted sub-range; elided)
            (void)I;
          }
          return true;
        }))
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) &&
         "Unexpected number of groups");
  return NumGroup;
}
/// Concatenate the shuffle masks of the shufflevector instructions in \p VL
/// into one combined element-level mask.
static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  SmallVector<int> Mask;
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
                                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
  return Mask;
}
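// E.g. two shuffles of <4 x i32> sources with masks {0,1} and {2,3} produce
// the combined mask {0,1,6,7}: the second group's indices are offset by
// SVNumElements (= 4) because its source is appended after the first one.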
/// \returns true if \p V is a constant, but not a ConstantExpr or a global.
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}

/// Checks if \p V is one of the vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vectors.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}
/// \returns the number of elements in the part \p Part of a vector of size
/// \p Size split into parts of \p PartNumElems elements (the last part may be
/// short).
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
}

#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debugging.
static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
  std::string Result;
  raw_string_ostream OS(Result);
  if (Idx >= 0)
    OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  return Result;
}
#endif
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  auto *I0 = dyn_cast<Instruction>(VL.front());
  if (!I0)
    return false;
  BasicBlock *BB = I0->getParent();
  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);
    if (!II)
      return false;
    if (BB != II->getParent())
      return false;
  }
  return true;
}
/// \returns true if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}
/// \returns true if \p I is commutative; handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(
                BO->uses(),
                [](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  ICmpInst::Predicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(BO->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}
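// Example: a `sub %a, %b` whose only user is `icmp eq (sub %a, %b), 0` may be
// treated as commutative, because swapping the operands (computing %b - %a)
// cannot change the eq/ne result. The same reasoning applies to subs consumed
// only by @llvm.abs / @llvm.fabs.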
template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                     unsigned Offset) {
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  int Index = Offset;
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }
  return std::nullopt;
}

/// \returns the linearized element index of an insert/extract instruction,
/// starting from \p Offset.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;
  unsigned Index = Offset;
  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  if (!IV)
    return std::nullopt;
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
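// Worked example: for `insertvalue {[2 x i32], [2 x i32]} %agg, i32 %v, 1, 0`
// the traversal computes Index = (0 * 2 + 1) * 2 + 0 = 2, i.e. the third of
// the four scalar slots in the flattened aggregate.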
/// Prepares a use bitset for the given mask either for the first argument or
/// for the second.
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}
/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getElementIndex(II);
        if (!Idx) {
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        SmallVector<int> SubMask(UseMask.size(), PoisonMaskElem);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}
/// Checks if the vector of extractelement instructions can be represented as
/// a shuffle of one or two source vectors, and if so, which shuffle kind.
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  unsigned Size =
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        if (!EI)
          return S;
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        if (!VTy)
          return S;
        return std::max(S, VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  bool HasNonUndefVec = any_of(VL, [](Value *V) {
    auto *EE = dyn_cast<ExtractElementInst>(V);
    if (!EE)
      return false;
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))
      return false;
    return true;
  });
  (void)HasNonUndefVec;
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from a poison vector.
    if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
      continue;
    if (isa<UndefValue>(Vec)) {
      Mask[I] = I;
    } else {
      if (isa<UndefValue>(EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
      if (!Idx)
        return std::nullopt;
      // Undefined behavior if Idx is negative or >= Size.
      if (Idx->getValue().uge(Size))
        continue;
      unsigned IntIdx = Idx->getValue().getZExtValue();
      Mask[I] = IntIdx;
    }
    // For correct shuffling we have to have at most 2 different vector
    // operands.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the lane number, it is a
    // permutation.
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're blending two vectors without crossing lanes, it is a select.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}
/// \returns the constant operand index of an extractelement/extractvalue
/// instruction \p E, if it can be determined.
static std::optional<unsigned> getExtractIndex(Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}
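// E.g. `extractelement <4 x float> %v, i64 2` yields 2, while an extractvalue
// with more than one index (a nested aggregate access) yields std::nullopt.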
namespace {

/// Main data required for vectorization of instructions.
struct InstructionsState {
  /// The very first instruction in the list with the main opcode.
  Value *OpValue = nullptr;

  /// The main/alternate instruction.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

  unsigned getOpcode() const { return MainOp ? MainOp->getOpcode() : 0; }

  unsigned getAltOpcode() const { return AltOp ? AltOp->getOpcode() : 0; }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return AltOp != MainOp; }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  InstructionsState() = delete;
  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};

} // end anonymous namespace
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex = 0);

/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (isConstant(BaseOp0) && isConstant(Op0)) ||
         (isConstant(BaseOp1) && isConstant(Op1)) ||
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
         getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
}

/// \returns true if a compare instruction \p CI matches \p BaseCI either
/// directly or with the operands (and predicate) swapped.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);

  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);

  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex) {
  // Make sure these are all Instructions.
  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
  CmpInst::Predicate BasePred =
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
              : CmpInst::BAD_ICMP_PREDICATE;
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;
  bool SwappedPredsCompatible = [&]() {
    if (!IsCmpOp)
      return false;
    SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      if (!I)
        return false;
      CmpInst::Predicate CurrentPred = I->getPredicate();
      CmpInst::Predicate SwappedCurrentPred =
          CmpInst::getSwappedPredicate(CurrentPred);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    // Total number of predicates > 2, but if we consider swapped predicates
    // compatible there are only 2: treat swappable predicates as compatible
    // opcodes, not as alternates.
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();
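  // E.g. predicates {slt, sgt, eq, eq} give UniqueNonSwappedPreds =
  // {slt, sgt, eq} (3) but UniquePreds = {slt, eq} (2), since sgt is just slt
  // with the operands swapped; the whole bundle can then be vectorized with
  // two predicates by swapping operands per lane.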
  auto *IBase = cast<Instruction>(VL[BaseIndex]);
  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = cast<Instruction>(VL[Cnt]);
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltIndex = Cnt;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltIndex = Cnt;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        // Check for compatible operands. If the corresponding operands are not
        // compatible - need to perform alternate vectorization.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if ((E == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex != BaseIndex) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltIndex = Cnt;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
                        CallBase->op_begin() +
                            CallBase->getBundleOperandsStartIndex()))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        }
      }
      continue;
    }
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }

  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
}
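// Example: for VL = {add, sub, add, sub} this returns MainOp = add and
// AltOp = sub, i.e. an "alternate shuffle" bundle that can be emitted as one
// vector add plus one vector sub blended together with a shufflevector.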
/// Checks if the specified value actually depends on the vectorized user,
/// i.e. whether the scalar \p Scalar must be extracted from the vector for
/// \p UserInst.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                        TargetLibraryInfo *TLI) {
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    return any_of(enumerate(CI->args()), [&](auto &&Arg) {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
             Arg.value().get() == Scalar;
    });
  }
  default:
    return false;
  }
}
/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns true if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}
/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // Check if input scalars were extended to match the size of other node.
       (SubMask.size() == Mask.size() &&
        std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
                    [](int Idx) { return Idx == PoisonMaskElem; }))) &&
      "SubMask with many inputs support must be larger than the mask.");
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}
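// Mask composition example: an existing Mask = {3,2,1,0} (a reverse) followed
// by SubMask = {0,0,2,2} yields NewMask[I] = Mask[SubMask[I]] = {3,3,1,1},
// i.e. the two shuffles fused into one.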
/// Order may have elements assigned a special value (Sz) which is out of
/// bounds. Such indices only appear in places corresponding to undef values;
/// this routine reassigns the unused in-bounds indices to those positions so
/// that \p Order becomes a proper permutation.
static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
  const unsigned Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
}
/// \returns a bitset for selecting opcodes: "0" for \p Opcode0 lanes and "1"
/// for \p Opcode1 lanes.
static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
                                      unsigned Opcode1) {
  Type *ScalarTy = VL[0]->getType();
  unsigned ScalarTyNumElements = getNumElements(ScalarTy);
  SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
  for (unsigned Lane : seq<unsigned>(VL.size()))
    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
  return OpcodeMask;
}
/// Builds the shuffle mask \p Mask that is the inverse of the permutation
/// given by \p Indices.
static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E, PoisonMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}
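// E.g. Indices = {2,0,1} (element I of the result comes from position
// Indices[I]) inverts to Mask = {1,2,0}.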
/// Reorders the list of scalars in accordance with the given \p Mask.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
                           ArrayRef<int> Mask) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  SmallVector<Value *> Prev(Scalars.size(),
                            PoisonValue::get(Scalars.front()->getType()));
  Prev.swap(Scalars);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Scalars[Mask[I]] = Prev[I];
}
/// \returns true if \p V has no in-block instruction operands (its operands
/// are PHIs or defined in other blocks).
static bool areAllOperandsNonInsts(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  return !mayHaveNonDefUseDependency(*I) &&
         all_of(I->operands(), [I](Value *V) {
           auto *IO = dyn_cast<Instruction>(V);
           if (!IO)
             return true;
           return isa<PHINode>(IO) || IO->getParent() != I->getParent();
         });
}

/// \returns true if all users of \p V are PHIs or live in other blocks, so
/// scheduling \p V inside its block is unconstrained by its users.
static bool isUsedOutsideBlock(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  // Limits the number of uses to save compile time.
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         all_of(I->users(), [I](User *U) {
           auto *IU = dyn_cast<Instruction>(U);
           if (!IU)
             return true;
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
         });
}

/// Checks if the specified value list does not require scheduling.
static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
  return !VL.empty() &&
         (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
}
namespace slpvectorizer {

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
  struct TreeEntry;
  struct ScheduleData;
  // ...

public:
  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE) /* ... */ {
    // ...
  }

  /// Returns whether the root node of the graph has in-tree users.
  bool isVectorizedFirstNode() const {
    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();
  }

  /// Returns the scalars of the very first graph node.
  ArrayRef<Value *> getRootNodeScalars() const {
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
  }

  /// Returns the signedness of the minimal-bitwidth entry for the root node.
  bool isSignedMinBitwidthRootNode() const {
    return MinBWs.at(VectorizableTree.front().get()).second;
  }
  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MultiNodeScalars.clear();
    MustGather.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
  }

  unsigned getMaxVecRegSize() const {
    return MaxVecRegSize;
  }

  unsigned getMinVecRegSize() const {
    return MinVecRegSize;
  }

  /// \returns the maximum vectorization factor for the given element width
  /// and opcode (0 means unlimited).
  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
    unsigned MaxVF = MaxVFOption.getNumOccurrences()
                         ? MaxVFOption
                         : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
  }
  /// Checks if the given list of loads can be vectorized (consecutive,
  /// strided, masked-gather, or must be gathered).
  LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                               SmallVectorImpl<unsigned> &Order,
                               SmallVectorImpl<Value *> &PointerOps,
                               bool TryRecursiveCheck = true) const;
  // ...

  /// This structure holds any data we need about the edges being traversed
  /// during buildTree_rec().
  struct EdgeInfo {
    // ...
    /// Debug print.
    void dump(raw_ostream &OS) const {
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
    }
  };

  /// A helper class used for scoring candidates for two consecutive lanes.
  class LookAheadHeuristics {
    // ...
  public:
    LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
                        ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
                        int MaxLevel)
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}
    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
    int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
                        ArrayRef<Value *> MainAltOps) const {
      // ...
      if (V1 == V2) {
        if (isa<LoadInst>(V1)) {
          // Returns true if the users of V1 and V2 won't need to be extracted.
          auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
            // Bail out if we have too many uses to save compilation time.
            if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
              return false;
            auto AllUsersVectorized = [U1, U2, this](Value *V) {
              return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
                return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
              });
            };
            return AllUsersVectorized(V1) && AllUsersVectorized(V2);
          };
          // A broadcast of a load can be cheaper on some targets.
          if (R.TTI->isLegalBroadcastLoad(V1->getType(),
                                          ElementCount::getFixed(NumLanes)) &&
              ((int)V1->getNumUses() == NumLanes ||
               AllUsersAreInternal(V1, V2)))
            return LookAheadHeuristics::ScoreSplatLoads;
        }
        return LookAheadHeuristics::ScoreSplat;
      }

      auto CheckSameEntryOrFail = [&]() {
        if (const TreeEntry *TE1 = R.getTreeEntry(V1);
            TE1 && TE1 == R.getTreeEntry(V2))
          return LookAheadHeuristics::ScoreSplatLoads;
        return LookAheadHeuristics::ScoreFail;
      };

      auto *LI1 = dyn_cast<LoadInst>(V1);
      auto *LI2 = dyn_cast<LoadInst>(V2);
      if (LI1 && LI2) {
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
            !LI2->isSimple())
          return CheckSameEntryOrFail();

        std::optional<int> Dist = getPointersDiff(
            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
        if (!Dist || *Dist == 0) {
          if (R.TTI->isLegalMaskedGather(
                  getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
            return LookAheadHeuristics::ScoreMaskedGatherCandidate;
          return CheckSameEntryOrFail();
        }
        // The distance is too large - still may be profitable to use masked
        // loads/gathers.
        if (std::abs(*Dist) > NumLanes / 2)
          return LookAheadHeuristics::ScoreMaskedGatherCandidate;
        // ...
      }

      auto *C1 = dyn_cast<Constant>(V1);
      auto *C2 = dyn_cast<Constant>(V2);
      if (C1 && C2)
        return LookAheadHeuristics::ScoreConstants;

      // Extracts from consecutive indexes of the same vector better score as
      // the extracts could be vectorized together.
      if (isa<UndefValue>(V2))
        return LookAheadHeuristics::ScoreConsecutiveExtracts;
      Value *EV2 = nullptr;
      // ... (match the extract indices Idx1/Idx2; elided)
      {
        int Dist = Idx2 - Idx1;
        // Same instruction.
        if (std::abs(Dist) == 0)
          return LookAheadHeuristics::ScoreSplat;
        if (std::abs(Dist) > NumLanes / 2)
          return LookAheadHeuristics::ScoreSameOpcode;
        // ...
        return CheckSameEntryOrFail();
      }

      auto *I1 = dyn_cast<Instruction>(V1);
      auto *I2 = dyn_cast<Instruction>(V2);
      if (I1 && I2) {
        if (I1->getParent() != I2->getParent())
          return CheckSameEntryOrFail();
        SmallVector<Value *, 4> Ops(MainAltOps);
        Ops.push_back(I1);
        Ops.push_back(I2);
        InstructionsState S = getSameOpcode(Ops, TLI);
        // Note: Only consider instructions with <= 2 operands to avoid
        // complexity explosion.
        if (S.getOpcode() &&
            (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
            all_of(Ops, [&S](Value *V) {
              return cast<Instruction>(V)->getNumOperands() ==
                     S.MainOp->getNumOperands();
            }))
          return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
                                  : LookAheadHeuristics::ScoreSameOpcode;
      }

      if (isa<UndefValue>(V2))
        return LookAheadHeuristics::ScoreUndef;

      return CheckSameEntryOrFail();
    }
    /// Go through the operands of \p LHS and \p RHS recursively until
    /// MaxLevel, and return the cummulative score.
    int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
                           Instruction *U2, int CurrLevel,
                           ArrayRef<Value *> MainAltOps) const {
      // Get the shallow score of V1 and V2.
      int ShallowScoreAtThisLevel =
          getShallowScore(LHS, RHS, U1, U2, MainAltOps);

      // If reached MaxLevel, or if V1 and V2 are not instructions, or if they
      // are already a splat, or if scoring failed, early return the current
      // score.
      auto *I1 = dyn_cast<Instruction>(LHS);
      auto *I2 = dyn_cast<Instruction>(RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      // Contains the I2 operand indexes that got matched with I1 operands.
      SmallSet<unsigned, 4> Op2Used;

      // Recursion towards the operands of I1 and I2. We are trying all
      // possible operand pairs, and keeping track of the best score.
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        // Try to pair op1I with the best operand of I2.
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
        // If I2 is commutative try all combinations.
        unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
        unsigned ToIdx = isCommutative(I2)
                             ? I2->getNumOperands()
                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          // Skip operands already paired with OpIdx1.
          if (Op2Used.count(OpIdx2))
            continue;
          // Recursively calculate the cost at each level.
          int TmpScore =
              getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
                                 I1, I2, CurrLevel + 1, std::nullopt);
          // Look for the best score.
          if (TmpScore > LookAheadHeuristics::ScoreFail &&
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
            MaxOpIdx2 = OpIdx2;
            FoundBest = true;
          }
        }
        if (FoundBest) {
          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
        }
      }
      return ShallowScoreAtThisLevel;
    }
  /// A helper data structure to hold the operands of a vector of instructions.
  /// This supports a fixed vector length for all operand vectors.
  class VLOperands {
    /// For each operand we need (i) the value, and (ii) the opcode that it
    /// would be attached to if the expression was in a left-linearized form.
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      /// The operand value.
      Value *V = nullptr;
      /// TreeEntries only allow a single opcode, or an alternate sequence of
      /// them (e.g, +, -), so a boolean suffices for the APO: 'true' means 'V'
      /// is attached to an inverse operation in the linearized form.
      bool APO = false;
      /// Helper data for the reordering function.
      bool IsUsed = false;
    };

    /// During operand reordering we select, per operand index, the strategy
    /// for matching the operand of the current lane with the neighboring lane.
    enum class ReorderingMode {
      Load,     ///< Matching loads to consecutive memory addresses
      Opcode,   ///< Matching instructions based on opcode (same or alternate)
      Constant, ///< Matching constants
      Splat,    ///< Matching the same instruction multiple times (broadcast)
      Failed,   ///< We failed to create a vectorizable group
    };

    // ...
    const Loop *L = nullptr;

    /// \returns the operand data at \p OpIdx and \p Lane.
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }

    /// Clears the used flag for all entries.
    void clearUsed() {
      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
           OpIdx != NumOperands; ++OpIdx)
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
             ++Lane)
          OpsVec[OpIdx][Lane].IsUsed = false;
    }

    /// Swap the operand at \p OpIdx1 with the operand at \p OpIdx2 in \p Lane.
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    }
    /// \returns the additional score for a candidate due to possible
    /// broadcasting of the elements in the lane.
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
        return 0;
      SmallPtrSet<Value *, 4> Uniques;
      for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
        if (Ln == Lane)
          continue;
        Value *OpIdxLnV = getData(OpIdx, Ln).V;
        if (!isa<Instruction>(OpIdxLnV))
          return 0;
        Uniques.insert(OpIdxLnV);
      }
      int UniquesCount = Uniques.size();
      int UniquesCntWithIdxLaneV =
          Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      int UniquesCntWithOpIdxLaneV =
          Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        return 0;
      return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
              UniquesCntWithOpIdxLaneV) -
             (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    }

    /// \returns the additional score for a scalar whose users are all
    /// vectorized.
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      // Do not care about the number of uses for vector-like instructions
      // (extractelement/extractvalue with constant indices): they are extracts
      // themselves and already externally used.
      if (isVectorLikeInstWithConstOps(IdxLaneV) &&
          isVectorLikeInstWithConstOps(OpIdxLaneV))
        return LookAheadHeuristics::ScoreAllUserVectorized;
      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
        return 0;
      return R.areAllUsersVectorized(IdxLaneI)
                 ? LookAheadHeuristics::ScoreAllUserVectorized
                 : 0;
    }

    /// Score scaling factor for fully compatible instructions but with
    /// different numbers of external uses.
    static const int ScoreScaleFactor = 10;

    /// \returns the look-ahead score, which tells us how much the sub-trees
    /// rooted at \p LHS and \p RHS match.
    int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
                          int Lane, unsigned OpIdx, unsigned Idx,
                          bool &IsUsed) {
      LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
                                    LookAheadMaxDepth);
      int Score =
          LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr,
                                       /*U2=*/nullptr,
                                       /*CurrLevel=*/1, MainAltOps);
      if (Score) {
        int SplatScore = getSplatScore(Lane, OpIdx, Idx);
        if (Score <= -SplatScore) {
          // Failed score.
          Score = 0;
        } else {
          Score += SplatScore;
          // Scale the score to distinguish between operands with all
          // vectorized uses and operands without; this matters for best
          // selection.
          Score *= ScoreScaleFactor;
          Score += getExternalUseScore(Lane, OpIdx, Idx);
          IsUsed = true;
        }
      }
      return Score;
    }
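    // Look-ahead example: when filling lane 1 against lane 0's {%a0 + %b0},
    // each candidate operand is scored by recursively comparing its operand
    // sub-trees with the neighbor's (consecutive loads score highest, then
    // matching opcodes, then constants), scaled by ScoreScaleFactor and
    // biased by the splat and external-use bonuses above.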
    /// Look for a good candidate for operand \p OpIdx at \p Lane that matches
    /// the operand already selected at \p LastLane.
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                   ArrayRef<ReorderingMode> ReorderingModes,
                   ArrayRef<Value *> MainAltOps) {
      unsigned NumOperands = getNumOperands();

      // The operand of the previous lane at OpIdx.
      Value *OpLastLane = getData(OpIdx, LastLane).V;

      // Our strategy mode for OpIdx.
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;

      // The linearized opcode of the operand at OpIdx, Lane.
      bool OpIdxAPO = getData(OpIdx, Lane).APO;

      // The best operand index and its score.
      struct BestOpData {
        std::optional<unsigned> Idx;
        unsigned Score = 0;
      } BestOp;
      BestOp.Score =
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
              .first->second;

      // Splat/Constant/Load candidates are consumed greedily.
      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;
      // Iterate through all unused operands and look for the best.
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        // Get the operand at Idx and Lane.
        OperandData &OpData = getData(Idx, Lane);
        Value *Op = OpData.V;
        bool OpAPO = OpData.APO;

        // Skip already selected operands.
        if (OpData.IsUsed)
          continue;

        // Skip if we are trying to move the operand to a position with a
        // different opcode in the linearized tree form. This would break the
        // semantics.
        if (OpAPO != OpIdxAPO)
          continue;

        // Look for an operand that matches the current mode.
        switch (RMode) {
        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed);
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
               Idx == OpIdx)) {
            BestOp.Idx = Idx;
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
          }
          break;
        }
        case ReorderingMode::Constant:
          if (isa<Constant>(Op) ||
              (!BestOp.Score && L && L->isLoopInvariant(Op))) {
            BestOp.Idx = Idx;
            if (isa<Constant>(Op)) {
              BestOp.Score = LookAheadHeuristics::ScoreConstants;
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                  LookAheadHeuristics::ScoreConstants;
            }
            if (isa<UndefValue>(Op) || !isa<Constant>(Op))
              IsUsed = false;
          }
          break;
        case ReorderingMode::Splat:
          if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
            IsUsed = Op == OpLastLane;
            if (Op == OpLastLane) {
              BestOp.Score = LookAheadHeuristics::ScoreSplat;
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                  LookAheadHeuristics::ScoreSplat;
            }
            BestOp.Idx = Idx;
          }
          break;
        case ReorderingMode::Failed:
          llvm_unreachable("Not expected Failed reordering mode.");
        }
      }

      if (BestOp.Idx) {
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
        return BestOp.Idx;
      }
      // If we could not find a good match return std::nullopt.
      return std::nullopt;
    }
    /// Helper for reorder(): select the lane to start the greedy reordering
    /// from - the lane whose operands can move the least.
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      // A simple voting scheme: count, per hash of the lane's operand set,
      // how many lanes share it, remembering a representative lane.
      SmallDenseMap<unsigned, std::pair<unsigned, unsigned>> HashMap;
      // If 2 lanes have the same cost, prefer the one with the higher index to
      // stay closer to the original order.
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        // Compare the number of operands that can move and choose the one with
        // the least number.
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap.clear();
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          // Select the lane with the fewest operands that would need to move.
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto *It = HashMap.find(NumFreeOpsHash.Hash);
          if (It == HashMap.end())
            HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
          else
            ++It->second.first;
        }
      }
      // Select the lane with the minimum counter.
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
      for (const auto &Data : reverse(HashMap)) {
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
      }
      return BestLane;
    }
    /// Data structure that helps to reorder operands.
    struct OperandsOrderData {
      /// The best number of operands with the same APOs, which can be
      /// reordered.
      unsigned NumOfAPOs = UINT_MAX;
      /// Number of operands with the same/alternate instruction opcode and
      /// parent.
      unsigned NumOpsWithSameOpcodeParent = 0;
      /// Hash of the actual operands ordering.
      unsigned Hash = 0;
    };

    /// \returns the maximum number of operands that are allowed to be
    /// reordered for \p Lane and the number of compatible instructions (with
    /// the same parent/opcode).
    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      // Operands with the same APO can be reordered. Since there are only two
      // APO values, count the 'true' ones and derive the other by subtraction.
      // Operands with the same instruction opcode and parent are more
      // profitable since we don't need to move them in many cases.
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
      Instruction *OpcodeI = nullptr;
      BasicBlock *Parent = nullptr;
      unsigned Hash = 0;
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        const OperandData &OpData = getData(OpIdx, Lane);
        if (OpData.APO)
          ++CntTrue;
        // Use Boyer-Moore majority voting to find the majority opcode/parent
        // and the number of times it occurs.
        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
          if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
              I->getParent() != Parent) {
            if (NumOpsWithSameOpcodeParent == 0) {
              NumOpsWithSameOpcodeParent = 1;
              OpcodeI = I;
              Parent = I->getParent();
            } else {
              --NumOpsWithSameOpcodeParent;
            }
          } else {
            ++NumOpsWithSameOpcodeParent;
          }
        }
        Hash = hash_combine(
            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
      }
      if (AllUndefs)
        return {};
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      Data.Hash = Hash;
      return Data;
    }
    /// Go through the instructions in VL and append their operands.
    void appendOperandsOfVL(ArrayRef<Value *> VL) {
      assert(!VL.empty() && "Bad VL");
      assert((empty() || VL.size() == getNumLanes()) &&
             "Expected same number of lanes");
      assert(isa<Instruction>(VL[0]) && "Expected instruction");
      unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
      constexpr unsigned IntrinsicNumOperands = 2;
      if (isa<IntrinsicInst>(VL[0]))
        NumOperands = IntrinsicNumOperands;
      OpsVec.resize(NumOperands);
      unsigned NumLanes = VL.size();
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        OpsVec[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
          // The LHS operand of both add and sub is never attached to an
          // inverse operation in the linearized form, so its APO is false.
          // The RHS is true only if VL[Lane] is an inverse operation.

          // Since operand reordering is performed on groups of commutative
          // operations or alternating sequences (e.g., +, -), we can safely
          // tell inverse operations apart by checking commutativity.
          bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
                                 APO, false};
        }
      }
    }
    /// \returns the number of operands.
    unsigned getNumOperands() const { return OpsVec.size(); }

    /// \returns the number of lanes.
    unsigned getNumLanes() const { return OpsVec[0].size(); }

    /// \returns the operand value at \p OpIdx and \p Lane.
    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    }

    /// \returns true if the data structure is empty.
    bool empty() const { return OpsVec.empty(); }

    /// Clears the data.
    void clear() { OpsVec.clear(); }
    /// Checks if there is at least one compatible operand in lanes other
    /// than \p Lane, compatible with the operand \p Op.
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(Op);
      unsigned Cnt = 0;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        // Pick the operand at the first position with the same APO that is not
        // used yet.
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
            continue;
          Value *OpILane = getValue(OpI, Lane);
          bool IsConstantOp = isa<Constant>(OpILane);
          // Consider the broadcast candidate if:
          // 1. The same value is found in one of the operands.
          if (Data.V == Op ||
              // 2. The operand in the given lane is not constant but there is
              // a constant operand in another lane (which can be moved to the
              // given lane). In this case both constants end up in the same
              // node.
              (!IsConstantOp &&
               ((Lns > 2 && isa<Constant>(Data.V)) ||
                (Lns == 2 &&
                 isa<Constant>(Data.V)))) ||
              // 3. The operand in the current lane is loop invariant (can be
              // hoisted out) and another operand is also loop invariant
              // (though not a constant): then the whole vector can be hoisted.
              (IsInvariant && !isa<Constant>(Data.V) &&
               !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
               L->isLoopInvariant(Data.V))) {
            FoundCandidate = true;
            Data.IsUsed = Data.V == Op;
            if (Data.V == Op)
              ++Cnt;
            break;
          }
        }
        if (!FoundCandidate)
          return false;
      }
      return getNumLanes() == 2 || Cnt > 1;
    }
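    // E.g. for lanes {%x + %a, %x + %b, %x + %c}: %x appears in every lane,
    // so shouldBroadcast(%x, ...) holds and this operand is better modeled as
    // a broadcast of %x plus a gathered {%a, %b, %c} vector than as three
    // mixed operand columns.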
    /// Checks if \p Op can be paired with a compatible operand in every other
    /// lane, i.e. whether it is worth keeping it in Opcode mode.
    bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
      bool OpAPO = getData(OpIdx, Lane).APO;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
              const OperandData &Data = getData(OpI, Ln);
              if (Data.APO != OpAPO || Data.IsUsed)
                return false;
              Value *OpILn = getValue(OpI, Ln);
              return (L && L->isLoopInvariant(OpILn)) ||
                     (getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
                      Op->getParent() == cast<Instruction>(OpILn)->getParent());
            }))
          continue;
        return false;
      }
      return true;
    }
  public:
    /// Initialize with all the operands of the instruction vector \p RootVL.
    VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
          L(R.LI->getLoopFor(
              (cast<Instruction>(RootVL.front())->getParent()))) {
      // Append all the operands of RootVL.
      appendOperandsOfVL(RootVL);
    }

    /// \returns a value vector with the operands across all lanes for the
    /// operand at \p OpIdx.
    ValueList getVL(unsigned OpIdx) const {
      ValueList OpVL(OpsVec[OpIdx].size());
      assert(OpsVec[OpIdx].size() == getNumLanes() &&
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;
      return OpVL;
    }
    // Performs operand reordering for 2 or more operands.
    void reorder() {
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      // Each operand has its own mode, used to select matching instructions
      // per lane.
      SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

      // This is a greedy single-pass algorithm: go over each lane once and
      // decide on the best order right away with no back-tracking. To
      // increase its effectiveness, start with the lane whose operands can
      // move the least, then move towards lanes whose operands can move more.
      unsigned FirstLane = getBestLaneToStartReordering();

      // Initialize the modes.
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);
        // Keep track of whether we have instructions with all the same opcode
        // on one side.
        if (isa<LoadInst>(OpLane0))
          ReorderingModes[OpIdx] = ReorderingMode::Load;
        else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
          // Check if OpLane0 should be broadcast.
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
              !canBeVectorized(OpILane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(OpLane0))
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        else if (isa<Argument>(OpLane0))
          // Our best hope is a Splat. It may save some cost in some cases.
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        else
          // NOTE: This should be unreachable.
          ReorderingModes[OpIdx] = ReorderingMode::Failed;
      }

      // Skip reordering if the operands already form a perfect diamond or
      // shuffled diamond match.
      auto &&SkipReordering = [this]() {
        SmallPtrSet<Value *, 4> UniqueValues;
        ArrayRef<OperandData> Op0 = OpsVec.front();
        for (const OperandData &Data : Op0)
          UniqueValues.insert(Data.V);
        for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
                return !UniqueValues.contains(Data.V);
              }))
            return false;
        }
        // TODO: re-check after full support of non-power-of-2 vectorization.
        return true;
      };

      if (SkipReordering())
        return;

      bool StrategyFailed = false;
      // Mark all operand data as free to use.
      clearUsed();
      // We keep the original operand order for the FirstLane, so reorder the
      // rest of the lanes. Visit the lanes in a circular fashion, using
      // FirstLane as the center point and increasing the radius.
      SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
      for (unsigned I = 0; I < NumOperands; ++I)
        MainAltOps[I].push_back(getData(I, FirstLane).V);

      for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
        // Visit the lane on the right and then the lane on the left.
        for (int Direction : {+1, -1}) {
          int Lane = FirstLane + Direction * Distance;
          if (Lane < 0 || Lane >= (int)NumLanes)
            continue;
          int LastLane = Lane - Direction;
          assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                 "Out of bounds");
          // Look for a good match for each operand.
          for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
            // Search for the operand that matches the one in LastLane.
            std::optional<unsigned> BestIdx = getBestOperand(
                OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
            if (BestIdx) {
              // Swap the current operand with the one returned by
              // getBestOperand().
              swap(OpIdx, *BestIdx, Lane);
            } else {
              // Enable the second pass.
              StrategyFailed = true;
            }
            // Try to get the alternate opcode and follow it during analysis.
            if (MainAltOps[OpIdx].size() != 2) {
              OperandData &AltOp = getData(OpIdx, Lane);
              InstructionsState OpS =
                  getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
              if (OpS.getOpcode() && OpS.isAltShuffle())
                MainAltOps[OpIdx].push_back(AltOp.V);
            }
          }
        }
      }
      // If the initial strategy failed for any of the operand indexes, the
      // strategy is rerun in a second pass (elided here).
      if (!StrategyFailed)
        return;
      // ...
    }
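    // Example: given the bundle {add(%a, %b), add(%c, %a)} with FirstLane = 0,
    // the reorderer swaps lane 1's operands so the columns become {%a, %a} and
    // {%b, %c}: one operand is now a broadcast of %a and the other a simple
    // gather, which is cheaper to materialize than two mixed columns.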
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    /// Debug print of a reordering mode.
    static StringRef getModeStr(ReorderingMode RMode) {
      switch (RMode) {
      case ReorderingMode::Load:
        return "Load";
      case ReorderingMode::Opcode:
        return "Opcode";
      case ReorderingMode::Constant:
        return "Constant";
      case ReorderingMode::Splat:
        return "Splat";
      case ReorderingMode::Failed:
        return "Failed";
      }
      llvm_unreachable("Unimplemented Reordering Type");
    }

    /// Debug print.
    LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
      const unsigned Indent = 2;
      unsigned Cnt = 0;
      for (const OperandDataVec &OpDataVec : OpsVec) {
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          OS.indent(Indent) << "{";
          if (Value *V = OpData.V)
            OS << *V;
          else
            OS << "null";
          OS << ", APO:" << OpData.APO << "}\n";
        }
        OS << "\n";
      }
      return OS;
    }
#endif
  };
  /// Evaluate each pair in \p Candidates and return the index of the pair
  /// with the highest score, i.e. the best chance to form the root of a
  /// profitable tree. Returns std::nullopt if nothing scored above \p Limit.
  std::optional<int>
  findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
                   int Limit = LookAheadHeuristics::ScoreFail) const {
    LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
                                  RootLookAheadMaxDepth);
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
      int Score = LookAhead.getScoreAtLevelRec(
          Candidates[I].first, Candidates[I].second,
          /*U1=*/nullptr, /*U2=*/nullptr, /*CurrLevel=*/1, std::nullopt);
      if (Score > BestScore) {
        BestScore = Score;
        Index = I;
      }
    }
    return Index;
  }
  /// Removes an instruction from its block and eventually deletes it. It is
  /// like Instruction::eraseFromParent() except that the actual deletion is
  /// delayed until BoUpSLP is destructed.
  void eraseInstruction(Instruction *I) {
    DeletedInstructions.insert(I);
  }

  /// Remove instructions from the parent function, clear the operands of \p
  /// DeadVals instructions, and mark trivially dead operands for deletion.
  template <typename T>
  void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
    SmallVector<WeakTrackingVH> DeadInsts;
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      DeletedInstructions.insert(I);
    }
    DenseSet<Value *> Processed;
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
        continue;
      auto *I = cast<Instruction>(V);
      salvageDebugInfo(*I);
      SmallVector<const TreeEntry *> Entries;
      if (const TreeEntry *Entry = getTreeEntry(I)) {
        Entries.push_back(Entry);
        auto It = MultiNodeScalars.find(I);
        if (It != MultiNodeScalars.end())
          Entries.append(It->second.begin(), It->second.end());
      }
      for (Use &U : I->operands()) {
        if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            wouldInstructionBeTriviallyDead(OpI, TLI) &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
             })))
          DeadInsts.push_back(OpI);
      }
      I->dropAllReferences();
    }
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      if (!I->getParent())
        continue;
      assert((I->use_empty() ||
              all_of(I->uses(),
                     [&](Use &U) {
                       return isDeleted(cast<Instruction>(U.getUser()));
                     })) &&
             "trying to erase instruction with users.");
      I->removeFromParent();
      SE->forgetValue(I);
    }
    // Process the dead instruction list until empty.
    while (!DeadInsts.empty()) {
      Value *V = DeadInsts.pop_back_val();
      Instruction *VI = cast_or_null<Instruction>(V);
      if (!VI || !VI->getParent())
        continue;
      assert(isInstructionTriviallyDead(VI, TLI) &&
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");

      // Don't lose the debug info while deleting the instructions.
      salvageDebugInfo(*VI);

      // Null out all of the instruction's operands to see if any operand
      // becomes dead as we go.
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (!OpV)
          continue;
        OpU.set(nullptr);

        if (!OpV->use_empty())
          continue;

        // If the operand is an instruction that became dead as we nulled out
        // the operand, and if it is 'trivially' dead, delete it in a future
        // loop iteration.
        if (auto *OpI = dyn_cast<Instruction>(OpV))
          if (!DeletedInstructions.contains(OpI) &&
              isInstructionTriviallyDead(OpI, TLI))
            DeadInsts.push_back(OpI);
      }

      VI->removeFromParent();
      DeletedInstructions.insert(VI);
      SE->forgetValue(VI);
    }
  }
  /// Checks if the instruction was already analyzed as a possible reduction
  /// root.
  bool isAnalyzedReductionRoot(Instruction *I) const {
    return AnalyzedReductionsRoots.count(I);
  }
  /// Register the given instruction as an already analyzed possible reduction
  /// root.
  void analyzedReductionRoot(Instruction *I) {
    AnalyzedReductionsRoots.insert(I);
  }
  // ...
  /// Clear the list of analyzed reduction root instructions.
  void clearReductionData() {
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
  }
  /// Checks if the given value was not scheduled.
  bool isNotScheduled(const Value *V) const {
    return NonScheduledFirst.contains(V);
  }

private:
  /// Checks if the value is vectorized or tries to demote it to a smaller
  /// type.
  bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
                             unsigned &BitWidth,
                             SmallVectorImpl<unsigned> &ToDemote,
                             DenseSet<const TreeEntry *> &Visited,
                             unsigned &MaxDepthLevel,
                             bool &IsProfitableToDemote,
                             bool IsTruncRoot) const;
  /// Checks if the operands of the given user TreeEntry can be reordered.
  bool canReorderOperands(TreeEntry *UserTE, /* ... */
                          SmallVectorImpl<TreeEntry *> &GatherOps);

  /// Reorders the node with reuses.
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  /// Returns vectorized operand \p OpIdx of the node \p UserTE, if it exists.
  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
    TreeEntry *TE = nullptr;
    const auto *It = find_if(VL, [&](Value *V) {
      TE = getTreeEntry(V);
      if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
        return true;
      auto It = MultiNodeScalars.find(V);
      if (It != MultiNodeScalars.end()) {
        for (TreeEntry *E : It->second) {
          if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
            TE = E;
            return true;
          }
        }
      }
      return false;
    });
    if (It != VL.end()) {
      assert(TE->isSame(VL) && "Expected same scalars.");
      return TE;
    }
    return nullptr;
  }

  /// Returns vectorized operand \p OpIdx of the node \p UserTE, if it exists.
  /// Const version.
  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);
  }

  /// Checks if all users of \p I are part of the vectorization tree.
  bool areAllUsersVectorized(Instruction *I /* ... */) const;

  /// \returns the graph entry for the \p Idx operand of the entry \p E.
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;

  /// \returns the cast context hint for the given graph node.
  TargetTransformInfo::CastContextHint
  getCastContextHint(const TreeEntry &TE) const;

  /// This is the recursive part of buildTree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
                     const EdgeInfo &EI);

  /// \returns true if the ExtractElement/ExtractValue instructions in \p VL
  /// can be vectorized to use the original vector.
  bool canReuseExtract(ArrayRef<Value *> VL, /* ... */
                       bool ResizeAllowed = false) const;

  /// Vectorize a single entry in the tree, the \p NodeIdx-th operand of the
  /// entry \p E.
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);

  /// Create a new vector from a list of scalar values.
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);

  /// Create a new vector from a list of scalar values.
  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);

  /// \returns the instruction in the bundle that can be used as a base point
  /// for scheduling.
  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  /// Tries to represent the gathered \p VL as shuffle(s) of extracted
  /// elements.
  std::optional<TargetTransformInfo::ShuffleKind>
  tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
                                           SmallVectorImpl<int> &Mask) const;

  /// Same as above, but split into \p NumParts register-sized parts.
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
  tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                             SmallVectorImpl<int> &Mask,
                             unsigned NumParts) const;

  /// Checks if the gathered \p VL can be represented as a single-register
  /// shuffle of previous tree entries.
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(/* ... */);

  /// Checks if the gathered \p VL can be represented as multi-register
  /// shuffle(s) of previous tree entries.
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
  isGatherShuffledEntry(/* ... */
                        unsigned NumParts, bool ForOrder = false);

  /// \returns the cost of gathering (inserting) the values in \p VL into a
  /// vector.
  InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                Type *ScalarTy) const;

  /// Set the Builder insert point to one after the last instruction in the
  /// bundle.
  void setInsertPointAfterBundle(const TreeEntry *E);

  /// \returns whether the VectorizableTree is fully vectorizable and will be
  /// beneficial even if the tree height is tiny.
  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  /// Collects stores that use scalars of the given tree entry.
  SmallVector<SmallVector<StoreInst *>>
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  /// Finds reorder indices implied by external stores using the tree entry.
  SmallVector<OrdersType, 1>
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
        if (Mask.size() != VL.size() && VL.size() == Scalars.size())
          return std::equal(VL.begin(), VL.end(), Scalars.begin());
        return VL.size() == Mask.size() &&
               std::equal(VL.begin(), VL.end(), Mask.begin(),
                          [Scalars](Value *V, int Idx) {
                            return (isa<UndefValue>(V) &&
                                    Idx == PoisonMaskElem) ||
                                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
                          });
      };
      if (!ReorderIndices.empty()) {
        // TODO: implement matching if the nodes are just reordered; the vector
        // can still be treated as the same if the list of scalars matches VL
        // directly, without reordering.
        SmallVector<int> Mask;
        inversePermutation(ReorderIndices, Mask);
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          ::addMask(Mask, ReuseShuffleIndices);
          return IsSame(Scalars, Mask);
        }
        return false;
      }
      return IsSame(Scalars, ReuseShuffleIndices);
    }

    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return isGather() && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;
    }

    /// \returns true if the current entry has the same operands as \p TE.
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        return false;
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (Used.test(K))
            continue;
          if (getOperand(K) == TE.getOperand(I)) {
            Used.set(K);
            break;
          }
        }
        // Check if we actually found the matching operand.
        if (PrevCount == Used.count())
          return false;
      }
      return true;
    }

    /// \returns the final vectorization factor for the node: the total number
    /// of vectorized scalars, including those used several times in the entry
    /// and counted in \a ReuseShuffleIndices, if any.
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    }

    /// Checks if the current node is a gather node.
    bool isGather() const { return State == NeedToGather; }
    /// The kind of a combined node (see the CombinedVectorize state).
    enum CombinedOpcode {
      NotCombinedOp = -1,
      MinMax = Instruction::OtherOpsEnd + 1,
    };
    CombinedOpcode CombinedOp = NotCombinedOp;

    // ...
    VecTreeTy &Container;
    // ...

    /// Set this bundle's \p OpIdx'th operand to \p OpVL.
    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
      assert(Operands[OpIdx].empty() && "Already resized?");
      assert(OpVL.size() <= Scalars.size() &&
             "Number of operands is greater than the number of scalars.");
      Operands[OpIdx].resize(OpVL.size());
      copy(OpVL, Operands[OpIdx].begin());
    }
    /// Set the operands of this bundle in their original order.
    void setOperandsInOrder() {
      assert(Operands.empty() && "Already initialized?");
      auto *I0 = cast<Instruction>(Scalars[0]);
      Operands.resize(I0->getNumOperands());
      unsigned NumLanes = Scalars.size();
      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
           OpIdx != NumOperands; ++OpIdx) {
        Operands[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          auto *I = cast<Instruction>(Scalars[Lane]);
          assert(I->getNumOperands() == NumOperands &&
                 "Expected same number of operands");
          Operands[OpIdx][Lane] = I->getOperand(OpIdx);
        }
      }
    }

    /// \returns the number of operands.
    unsigned getNumOperands() const { return Operands.size(); }

    /// \returns the \p OpIdx operand of this TreeEntry.
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      assert(!Operands[OpIdx].empty() && "No operand available");
      return Operands[OpIdx][0];
    }
    /// Some of the instructions in the list have alternate opcodes.
    bool isAltShuffle() const { return MainOp != AltOp; }

    bool isOpcodeOrAlt(Instruction *I) const {
      unsigned CheckedOpcode = I->getOpcode();
      return (getOpcode() == CheckedOpcode ||
              getAltOpcode() == CheckedOpcode);
    }

    /// Chooses the correct key for scheduling data. If \p Op has the same (or
    /// alternate) opcode as \p MainOp, the key is \p Op. Otherwise the key is
    /// \p MainOp.
    Value *isOneOf(Value *Op) const {
      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))
        return Op;
      return MainOp;
    }

    void setOperations(const InstructionsState &S) {
      MainOp = S.MainOp;
      AltOp = S.AltOp;
    }

    /// The main/alternate opcodes for the list of instructions.
    unsigned getOpcode() const {
      return MainOp ? MainOp->getOpcode() : 0;
    }

    unsigned getAltOpcode() const {
      return AltOp ? AltOp->getOpcode() : 0;
    }

    /// When ReuseShuffleIndices is empty this just returns the position of \p
    /// V within the vector of Scalars; otherwise it remaps via the reuse and
    /// reorder indices.
    int findLaneForValue(Value *V) const {
      unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReorderIndices.empty())
        FoundLane = ReorderIndices[FoundLane];
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReuseShuffleIndices.empty()) {
        FoundLane = std::distance(ReuseShuffleIndices.begin(),
                                  find(ReuseShuffleIndices, FoundLane));
      }
      return FoundLane;
    }
    /// \returns true if this is a non-power-of-2 node.
    bool isNonPowOf2Vec() const {
      bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    }
#ifndef NDEBUG
    /// Debug printer.
    LLVM_DUMP_METHOD void dump() const {
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          dbgs().indent(2) << *V << "\n";
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        dbgs().indent(2) << *V << "\n";
      dbgs() << "State: ";
      switch (State) {
      case Vectorize:
        dbgs() << "Vectorize\n";
        break;
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        break;
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        break;
      case NeedToGather:
        dbgs() << "NeedToGather\n";
        break;
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
        break;
      }
      dbgs() << "MainOp: ";
      if (MainOp)
        dbgs() << *MainOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "AltOp: ";
      if (AltOp)
        dbgs() << *AltOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "\n";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";
      dbgs() << "\n";
    }
#endif
  };

#ifndef NDEBUG
  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
                     InstructionCost VecCost, InstructionCost ScalarCost,
                     StringRef Banner) const {
    dbgs() << "SLP: " << Banner << ":\n";
    E->dump();
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
  }
#endif
  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                        ReuseShuffleIndices, ReorderIndices);
  }

  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          TreeEntry::EntryState EntryState,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
           "Need to vectorize gather entry?");
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->Scalars.assign(VL.begin(), VL.end());
      Last->setOperations(S);
    } else {
      // Reorder scalars and build the final mask.
      Last->Scalars.assign(VL.size(), nullptr);
      transform(ReorderIndices, Last->Scalars.begin(),
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
                  return VL[Idx];
                });
      InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(),
                                  ReorderIndices.end());
    }
    if (!Last->isGather()) {
      for (Value *V : VL) {
        const TreeEntry *TE = getTreeEntry(V);
        assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
               "Scalar already in tree!");
        if (TE) {
          if (TE != Last)
            MultiNodeScalars.try_emplace(V).first->getSecond().push_back(
                Last);
          continue;
        }
        ScalarToTreeEntry[V] = Last;
      }
      // Update the scheduler bundle to point to this TreeEntry.
      ScheduleData *BundleMember = *Bundle;
      assert((BundleMember || isa<PHINode>(S.MainOp) ||
              isVectorLikeInstWithConstOps(S.MainOp) ||
              doesNotNeedToSchedule(VL)) &&
             "Bundle and VL out of sync");
      if (BundleMember) {
        for (Value *V : VL) {
          if (doesNotNeedToBeScheduled(V))
            continue;
          if (!BundleMember)
            continue;
          BundleMember->TE = Last;
          BundleMember = BundleMember->NextInBundle;
        }
      }
      assert(!BundleMember && "Bundle and VL out of sync");
    } else {
      // Build a map from gathered scalars to the nodes where they are used.
      bool AllConstsOrCasts = true;
      for (Value *V : VL)
        if (!isConstant(V)) {
          auto *I = dyn_cast<CastInst>(V);
          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
          ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
        }
      if (AllConstsOrCasts)
        CastMaxMinBWSizes =
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert(VL.begin(), VL.end());
    }

    if (UserTreeIdx.UserTE) {
      Last->UserTreeIndices.push_back(UserTreeIdx);
      assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
             "Reordering isn't implemented for non-power-of-2 nodes yet");
    }
    return Last;
  }
  /// -- Vectorization State --
  /// Holds all of the tree entries.
  TreeEntry::VecTreeTy VectorizableTree;

#ifndef NDEBUG
  /// Debug printer.
  LLVM_DUMP_METHOD void dumpVectorizableTree() const {
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
      dbgs() << "\n";
    }
  }
#endif

  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }

  const TreeEntry *getTreeEntry(Value *V) const {
    return ScalarToTreeEntry.lookup(V);
  }
  /// Check that the operand node of an alternate node does not generate a
  /// buildvector sequence; if it does, alternate vectorization is likely
  /// unprofitable.
  bool areAltOperandsProfitable(const InstructionsState &S,
                                ArrayRef<Value *> VL) const;

  /// Checks if the specified list of instructions/values can be vectorized
  /// and fills the required data before the actual scheduling.
  TreeEntry::EntryState getScalarsVectorizationState(/* ... */) const;

  // ...
  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;

  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    // ...
  };

  /// Checks if two instructions may access the same memory, caching the
  /// answer.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    // First check if the result is already in the cache.
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto It = AliasCache.find(Key);
    if (It != AliasCache.end())
      return It->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    // Store the result in the cache (symmetrically).
    AliasCache.try_emplace(Key, Aliased);
    AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
    return Aliased;
  }

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  // ...
  /// A list of (Internal Scalar : External User) pairs that need to be
  /// extracted out of the tree. The external user can be nullptr, meaning the
  /// internal scalar will be used later, after vectorization.
  UserList ExternalUses;
  /// Contains all scheduling-relevant data for an instruction. A ScheduleData
  /// either represents a single instruction or a member of an instruction
  /// bundle (= a group of instructions which is combined into a vector
  /// instruction).
  struct ScheduleData {
    // The initial value for the dependency counters: dependencies are not
    // calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

    void init(int BlockSchedulingRegionID, Value *OpVal) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      // ...
    }

    /// Verify basic self-consistency properties.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

      if (IsScheduled) {
        assert(isSchedulingEntity() &&
               "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");
        }
      }

      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      UnscheduledDeps = Dependencies;
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      int Sum = 0;
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
          return InvalidDeps;
        Sum += BundleMember->UnscheduledDeps;
      }
      return Sum;
    }

    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst = nullptr;

    /// The TreeEntry that this instruction corresponds to.
    TreeEntry *TE = nullptr;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle = nullptr;

    /// Single linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle = nullptr;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore = nullptr;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;

    /// The number of dependencies: the number of users of the instruction
    /// plus the number of dependent memory instructions (if any). Calculated
    /// on demand; InvalidDeps if not calculated yet.
    int Dependencies = InvalidDeps;

    /// The number of dependencies minus the number of dependencies of already
    /// scheduled instructions. As soon as this is zero, the instruction or
    /// bundle gets ready for scheduling.
    int UnscheduledDeps = InvalidDeps;

    /// True if this instruction is scheduled (or considered as scheduled in
    /// the dry-run).
    bool IsScheduled = false;
  };

#ifndef NDEBUG
  friend inline raw_ostream &operator<<(raw_ostream &os,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(os);
    return os;
  }
#endif
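  // Note on bundle readiness: a bundle such as {load %a; load %b} only
  // becomes ready when unscheduledDepsInBundle() summed over *all* members
  // reaches zero; a single member hitting zero is not sufficient, since the
  // whole bundle is scheduled as one unit.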
  /// Contains all scheduling data for a basic block.
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      // Reduce the maximum schedule region size by the size of the previous
      // scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }

    ScheduleData *getScheduleData(Instruction *I) {
      if (BB != I->getParent())
        // Avoid the lookup if it can't possibly be in the map.
        return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))
        return SD;
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }
    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;

      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {

        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert into the ready list if
        // ready.
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          ScheduleData *OpDef = getScheduleData(I);
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:    gets ready (def): " << *DepBundle << "\n");
          }
        };

        // If BundleMember is a vector bundle, its operands may have been
        // reordered during buildTree(), so get them through the TreeEntry.
        if (TreeEntry *TE = BundleMember->TE) {
          // Need to search for the lane since the tree entry can be
          // reordered.
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, BundleMember->Inst));
          assert(Lane >= 0 && "Lane not set");

          // Since the vectorization tree is built recursively, this assertion
          // ensures that the tree entry has all operands set before reaching
          // this code. Known exceptions are extracts, whose second
          // (immediate) operand is not added; immediates do not affect
          // scheduler behavior, so this is fine.
          auto *In = BundleMember->Inst;
          assert(
              In &&
              (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
               In->getNumOperands() == TE->getNumOperands()) &&
              "Missed TreeEntry operands?");
          (void)In; // fake use to silence -Wunused with assertions disabled

          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              DecrUnsched(I);
        } else {
          // If BundleMember is a stand-alone instruction, no operand
          // reordering has taken place, so we directly access its operands.
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
              DecrUnsched(I);
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:    gets ready (mem): " << *DepBundle << "\n");
          }
        }
        // Handle the control dependencies.
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:    gets ready (ctl): " << *DepBundle << "\n");
          }
        }
      }
    }
    /// Verify basic self-consistency properties of the data structure.
    void verify() {
      if (!ScheduleStart)
        return;

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
        SD->verify();
      }

      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");
        (void)SD;
      }
    }

    /// Put all instructions into the ReadyList which are ready for
    /// scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
            SD->isReady()) {
          ReadyList.insert(SD);
          LLVM_DEBUG(dbgs()
                     << "SLP:    initially in ready list: " << *SD << "\n");
        }
      }
    }
    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    /// \returns the scheduling bundle; not std::nullopt if \p VL is allowed
    /// to be scheduled.
    std::optional<ScheduleData *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S);

    /// Allocates schedule data chunks.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that \p V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all
    /// instructions/bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();

    BasicBlock *BB;
    // ...

    /// The first memory-accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory-accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region? Used to optimize the dependence calculation for the common
    /// case where there isn't.
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;
    // ...

    /// The ID of the scheduling region. For a new vectorization iteration
    /// this is incremented, which "removes" all ScheduleData from the region.
    /// The initial value 1 is greater than the initial SchedulingRegionID in
    /// ScheduleData (which is 0).
    int SchedulingRegionID = 1;
  };

  // ...
  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);
  /// DenseMapInfo for OrdersType keys.
  struct OrdersTypeDenseMapInfo {
    // ...
    static unsigned getHashValue(const OrdersType &V) {
      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
    }
    // ...
  };

  // ...
  unsigned MaxVecRegSize; // Set by the TTI or overridden by cl::opt.
  unsigned MinVecRegSize; // Set by cl::opt (default: 128).
  // ...

  /// Final size of the reduced vector, if the current graph represents the
  /// input for a reduction and it was possible to narrow the size of the
  /// reduction.
  unsigned ReductionBitWidth = 0;

  /// If the tree contains any zext/sext/trunc nodes, contains the max-min
  /// pair of the type sizes used in the tree.
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
  /// GraphTraits support: iterate the SLP graph via the users of each node.
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    // ...
  };

  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }

  static ChildIteratorType child_begin(NodeRef N) {
    return {N->UserTreeIndices.begin(), N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {N->UserTreeIndices.end(), N->Container};
  }

  /// For the node iterator we just need to turn the TreeEntry iterator into a
  /// TreeEntry* iterator so that it dereferences to NodeRef.
  class nodes_iterator {
    // ...
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
  };

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }

  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << Entry->Idx << ".\n";
    if (isSplat(Entry->Scalars))
      OS << "<splat> ";
    for (auto *V : Entry->Scalars) {
      OS << *V;
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
          }))
        OS << " <extract>";
      OS << "\n";
    }
    return Str;
  }

  static std::string getNodeAttributes(const TreeEntry *Entry,
                                       const BoUpSLP *) {
    if (Entry->isGather())
      return "color=red";
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize)
      return "color=blue";
    return "";
  }
BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      // Temporarily insert instructions back into the block to erase them
      // from the parent and from memory later.
      if (isa<PHINode>(I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator());
      continue;
    }
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }
  // Cleanup any dead scalar code feeding the vectorized instructions.
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);

#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  assert(!verifyFunction(*F, &dbgs()));
#endif
}

/// Reorders the given \p Reuses mask according to the given \p Mask. \p
/// Reuses contains the original mask for the scalars reused in the node.
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Reuses[Mask[I]] = Prev[I];
}
/// Reorders the given \p Order according to the given \p Mask. \p Order is
/// the original order of the scalars. If the resulting order is an identity,
/// \p Order is cleared.
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
    Order.clear();
    return;
  }
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  // Try to find subvector extract/insert patterns and reorder only such
  // patterns.
  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
  Type *ScalarTy = GatheredScalars.front()->getType();
  int NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  auto *VecTy = getWidenedType(ScalarTy, NumScalars);
  int NumParts = TTI->getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= NumScalars)
    NumParts = 1;
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
      ExtractShuffles =
          tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                            /*ForOrder=*/true);
  // No shuffled operands - ignore.
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
      Entries.front().front()->isSame(TE.Scalars)) {
    // Perfect match in the graph - reuse the previously vectorized node at
    // zero cost.
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    int SingleElt = PoisonMaskElem;
    return all_of(Mask, [&](int I) {
      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
        SingleElt = I;
      return I == PoisonMaskElem || I == SingleElt;
    });
  };
  // Exclusive broadcast mask - ignore.
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  SmallBitVector ShuffledSubMasks(NumParts);
  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                  ArrayRef<int> Mask, int PartSz, int NumParts,
                                  function_ref<unsigned(unsigned)> GetVF) {
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
        continue;
      const int VF = GetVF(I);
      if (VF == 0)
        continue;
      unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
      MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
      // Shuffle of at least 2 vectors - ignore.
      if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      // Try to include as many elements from the mask as possible.
      int FirstMin = INT_MAX;
      int SecondVecFound = false;
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem) {
          Value *V = GatheredScalars[I * PartSz + K];
          if (isConstant(V) && !isa<PoisonValue>(V)) {
            SecondVecFound = true;
            break;
          }
          continue;
        }
        if (Idx < VF) {
          if (FirstMin > Idx)
            FirstMin = Idx;
        } else {
          SecondVecFound = true;
          break;
        }
      }
      FirstMin = (FirstMin / PartSz) * PartSz;
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem)
          continue;
        Idx -= FirstMin;
        if (Idx >= PartSz) {
          SecondVecFound = true;
          break;
        }
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      }
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
    }
  };
  int PartSz = getPartNumElems(NumScalars, NumParts);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          unsigned VF = 0;
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            if (ExtractMask[K] == PoisonMaskElem)
              continue;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  // Check a special corner case - a single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts,
                         [&](unsigned I) {
                           if (!GatherShuffles[I])
                             return 0U;
                           return std::max(
                               Entries[I].front()->getVectorFactor(),
                               Entries[I].back()->getVectorFactor());
                         });
  int NumUndefs =
      count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
  if (ShuffledSubMasks.all() ||
      (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  if (!GEP1)
    return false;
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  if (!GEP2)
    return false;
  return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
         ((isConstant(GEP1->getOperand(1)) &&
           isConstant(GEP2->getOperand(1))) ||
          !CompareOpcodes ||
          getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
              .getOpcode());
}

/// Calculates the minimal alignment as the common alignment of all values.
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}
/// Check if \p Order represents a reverse order.
static bool isReverseOrder(ArrayRef<unsigned> Order) {
  unsigned Sz = Order.size();
  return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
  });
}
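// E.g. for Sz = 4, both {3,2,1,0} and {3,2,1,4} are treated as reverse orders,
// since the out-of-bounds value Sz marks an undef slot that may sit anywhere.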
4509static std::optional<Value *>
4515 const SCEV *PtrSCEVLowest =
nullptr;
4516 const SCEV *PtrSCEVHighest =
nullptr;
4522 return std::nullopt;
4524 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4525 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4529 if (isa<SCEVCouldNotCompute>(Diff))
4530 return std::nullopt;
4532 PtrSCEVLowest = PtrSCEV;
4536 if (isa<SCEVCouldNotCompute>(Diff1))
4537 return std::nullopt;
4539 PtrSCEVHighest = PtrSCEV;
4545 if (isa<SCEVCouldNotCompute>(Dist))
4546 return std::nullopt;
4547 int Size =
DL.getTypeStoreSize(ElemTy);
4548 auto TryGetStride = [&](
const SCEV *Dist,
4549 const SCEV *Multiplier) ->
const SCEV * {
4550 if (
const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4551 if (M->getOperand(0) == Multiplier)
4552 return M->getOperand(1);
4553 if (M->getOperand(1) == Multiplier)
4554 return M->getOperand(0);
4557 if (Multiplier == Dist)
4562 const SCEV *Stride =
nullptr;
4563 if (
Size != 1 || SCEVs.
size() > 2) {
4565 Stride = TryGetStride(Dist, Sz);
4567 return std::nullopt;
4569 if (!Stride || isa<SCEVConstant>(Stride))
4570 return std::nullopt;
4573 using DistOrdPair = std::pair<int64_t, int>;
4575 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
4577 bool IsConsecutive =
true;
4578 for (
const SCEV *PtrSCEV : SCEVs) {
4580 if (PtrSCEV != PtrSCEVLowest) {
4582 const SCEV *Coeff = TryGetStride(Diff, Stride);
4584 return std::nullopt;
4585 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4586 if (!SC || isa<SCEVCouldNotCompute>(SC))
4587 return std::nullopt;
4591 return std::nullopt;
4592 Dist = SC->getAPInt().getZExtValue();
4596 return std::nullopt;
4597 auto Res = Offsets.emplace(Dist, Cnt);
4599 return std::nullopt;
4601 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4604 if (Offsets.size() != SCEVs.
size())
4605 return std::nullopt;
4606 SortedIndices.
clear();
4607 if (!IsConsecutive) {
4611 for (
const std::pair<int64_t, int> &Pair : Offsets) {
4612 SortedIndices[Cnt] = Pair.second;
4622static std::pair<InstructionCost, InstructionCost>
4638 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4641 Mask, NumSrcElts, NumSubElts,
Index)) {
4642 if (
Index + NumSubElts > NumSrcElts &&
4643 Index + NumSrcElts <=
static_cast<int>(Mask.size()))
4663 if (
DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy))
4669 const unsigned Sz = VL.
size();
4671 auto *POIter = PointerOps.
begin();
4672 for (
Value *V : VL) {
4673 auto *L = cast<LoadInst>(V);
4676 *POIter = L->getPointerOperand();
4687 "supported with VectorizeNonPowerOf2");
4691 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4702 if (Order.
empty()) {
4703 Ptr0 = PointerOps.
front();
4704 PtrN = PointerOps.
back();
4706 Ptr0 = PointerOps[Order.
front()];
4707 PtrN = PointerOps[Order.
back()];
4709 std::optional<int> Diff =
4712 if (
static_cast<unsigned>(*Diff) == Sz - 1)
4715 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4729 auto IsAnyPointerUsedOutGraph =
4730 IsPossibleStrided &&
any_of(PointerOps, [&](
Value *V) {
4731 return isa<Instruction>(V) &&
any_of(V->users(), [&](
User *U) {
4732 return !getTreeEntry(U) && !MustGather.contains(U);
4735 if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
4737 (
static_cast<unsigned>(std::abs(*Diff)) <=
4740 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4741 *Diff == -(
static_cast<int>(Sz) - 1))) {
4742 int Stride = *Diff /
static_cast<int>(Sz - 1);
4743 if (*Diff == Stride *
static_cast<int>(Sz - 1)) {
4755 else if (
Ptr != Ptr0)
4760 if (((Dist / Stride) * Stride) != Dist ||
4761 !Dists.
insert(Dist).second)
4764 if (Dists.
size() == Sz)
4770 auto CheckForShuffledLoads = [&, &
TTI = *
TTI](
Align CommonAlignment) {
4771 unsigned Sz =
DL->getTypeSizeInBits(ScalarTy);
4773 unsigned MaxVF = std::max<unsigned>(
bit_floor(VL.
size() / 2), MinVF);
4774 MaxVF = std::min(
getMaximumVF(Sz, Instruction::Load), MaxVF);
4775 for (
unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4776 unsigned VectorizedCnt = 0;
4778 for (
unsigned Cnt = 0,
End = VL.
size(); Cnt + VF <=
End;
4779 Cnt += VF, ++VectorizedCnt) {
4797 if (VectorizedCnt == VL.
size() / VF) {
4800 auto [ScalarGEPCost, VectorGEPCost] =
getGEPCosts(
4801 TTI, PointerOps, PointerOps.
front(), Instruction::GetElementPtr,
4805 Instruction::Load, VecTy,
4807 false, CommonAlignment,
CostKind) +
4808 VectorGEPCost - ScalarGEPCost;
4812 auto *LI0 = cast<LoadInst>(VL[
I * VF]);
4815 auto [ScalarGEPCost, VectorGEPCost] =
4817 LI0->getPointerOperand(), Instruction::Load,
4820 Instruction::Load, SubVecTy, LI0->getAlign(),
4821 LI0->getPointerAddressSpace(),
CostKind,
4823 VectorGEPCost - ScalarGEPCost;
4827 auto [ScalarGEPCost, VectorGEPCost] =
4829 LI0->getPointerOperand(), Instruction::Load,
4833 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4834 false, CommonAlignment,
CostKind) +
4835 VectorGEPCost - ScalarGEPCost;
4839 auto [ScalarGEPCost, VectorGEPCost] =
getGEPCosts(
4841 LI0->getPointerOperand(), Instruction::GetElementPtr,
4845 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4846 false, CommonAlignment,
CostKind) +
4847 VectorGEPCost - ScalarGEPCost;
4852 "Expected only consecutive, strided or masked gather loads.");
4855 for (
int Idx : seq<int>(0, VL.
size()))
4859 ShuffleMask,
CostKind,
I * VF, SubVecTy);
4864 if (MaskedGatherCost >= VecLdCost)
4874 bool ProfitableGatherPointers =
4877 return L->isLoopInvariant(V);
4879 if (ProfitableGatherPointers ||
all_of(PointerOps, [IsSorted](
Value *
P) {
4880 auto *
GEP = dyn_cast<GetElementPtrInst>(
P);
4882 (
GEP &&
GEP->getNumOperands() == 2 &&
4883 isa<Constant, Instruction>(
GEP->getOperand(1)));
4885 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4890 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4909 "Expected list of pointer operands.");
4914 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4919 std::optional<int> Diff =
4925 Base.second.emplace_back(
Ptr, *Diff, Cnt++);
4931 if (Bases.
size() > VL.
size() / 2 - 1)
4935 Bases[
Ptr].emplace_back(
Ptr, 0, Cnt++);
4941 bool AnyConsecutive =
false;
4942 for (
auto &
Base : Bases) {
4943 auto &Vec =
Base.second;
4944 if (Vec.size() > 1) {
4946 const std::tuple<Value *, int, unsigned> &
Y) {
4947 return std::get<1>(
X) < std::get<1>(
Y);
4949 int InitialOffset = std::get<1>(Vec[0]);
4951 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
4957 SortedIndices.
clear();
4958 if (!AnyConsecutive)
4966 for (
auto &
Base : Bases) {
4967 Value *Strip =
Base.first->stripInBoundsConstantOffsets();
4968 Value *Root = Strip;
4969 while (
auto *Gep = dyn_cast<GetElementPtrInst>(Root))
4970 Root = Gep->getOperand(0);
4973 auto *Begin = SortedBases.
begin();
4974 auto *
End = SortedBases.
end();
4975 while (Begin !=
End) {
4976 Value *Root = std::get<2>(*Begin);
4977 auto *Mid = std::stable_partition(
4978 Begin,
End, [&Root](
auto V) {
return std::get<2>(V) == Root; });
4980 for (
auto I = Begin;
I < Mid; ++
I)
4981 LessThan.try_emplace(std::get<1>(*
I));
4982 for (
auto I = Begin;
I < Mid; ++
I) {
4983 Value *V = std::get<1>(*
I);
4984 while (
auto *Gep = dyn_cast<GetElementPtrInst>(V)) {
4985 V = Gep->getOperand(0);
4986 if (LessThan.contains(V))
4987 LessThan[V][std::get<1>(*
I)] =
true;
4990 std::stable_sort(Begin, Mid, [&LessThan](
auto &V1,
auto &V2) {
4991 return LessThan[std::get<1>(V1)][std::get<1>(V2)];
4997 for (
auto Base : SortedBases)
4998 for (
auto &
T : Bases[std::get<0>(
Base)])
5002 "Expected SortedIndices to be the size of VL");
5006std::optional<BoUpSLP::OrdersType>
5008 assert(TE.isGather() &&
"Expected gather node only.");
5009 Type *ScalarTy = TE.Scalars[0]->getType();
5012 Ptrs.
reserve(TE.Scalars.size());
5013 for (
Value *V : TE.Scalars) {
5014 auto *L = dyn_cast<LoadInst>(V);
5015 if (!L || !L->isSimple())
5016 return std::nullopt;
5022 return std::move(Order);
5023 return std::nullopt;
5034 if (VU->
getType() != V->getType())
5037 if (!VU->
hasOneUse() && !V->hasOneUse())
5043 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5049 cast<VectorType>(VU->
getType())->getElementCount().getKnownMinValue());
5050 bool IsReusedIdx =
false;
5052 if (IE2 == VU && !IE1)
5054 if (IE1 == V && !IE2)
5055 return V->hasOneUse();
5056 if (IE1 && IE1 != V) {
5058 IsReusedIdx |= ReusedIdx.
test(Idx1);
5059 ReusedIdx.
set(Idx1);
5060 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
5063 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5065 if (IE2 && IE2 != VU) {
5067 IsReusedIdx |= ReusedIdx.
test(Idx2);
5068 ReusedIdx.
set(Idx2);
5069 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5072 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5074 }
while (!IsReusedIdx && (IE1 || IE2));
5078std::optional<BoUpSLP::OrdersType>
5081 if (TE.isNonPowOf2Vec())
5082 return std::nullopt;
5086 if (!TE.ReuseShuffleIndices.empty()) {
5088 return std::nullopt;
5096 unsigned Sz = TE.Scalars.size();
5097 if (TE.isGather()) {
5098 if (std::optional<OrdersType> CurrentOrder =
5103 ::addMask(Mask, TE.ReuseShuffleIndices);
5104 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5105 unsigned Sz = TE.Scalars.size();
5106 for (
int K = 0,
E = TE.getVectorFactor() / Sz; K <
E; ++K) {
5109 Res[
Idx + K * Sz] =
I + K * Sz;
5111 return std::move(Res);
5114 if (Sz == 2 && TE.getVectorFactor() == 4 &&
5116 2 * TE.getVectorFactor())) == 1)
5117 return std::nullopt;
5121 if (TE.ReorderIndices.empty())
5122 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
5125 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
5126 unsigned VF = ReorderMask.
size();
5130 for (
unsigned I = 0;
I < VF;
I += Sz) {
5132 unsigned UndefCnt = 0;
5133 unsigned Limit = std::min(Sz, VF -
I);
5142 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
5144 return std::nullopt;
5146 for (
unsigned K = 0; K < NumParts; ++K)
5147 ResOrder[Val + Sz * K] =
I + K;
5149 return std::move(ResOrder);
5151 unsigned VF = TE.getVectorFactor();
5154 TE.ReuseShuffleIndices.end());
5155 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
5157 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5158 return Idx && *Idx < Sz;
5161 if (TE.ReorderIndices.empty())
5162 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
5165 for (
unsigned I = 0;
I < VF; ++
I) {
5166 int &
Idx = ReusedMask[
I];
5169 Value *V = TE.Scalars[ReorderMask[
Idx]];
5171 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
5177 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
5178 auto *It = ResOrder.
begin();
5179 for (
unsigned K = 0; K < VF; K += Sz) {
5183 std::iota(SubMask.begin(), SubMask.end(), 0);
5185 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
5186 std::advance(It, Sz);
5189 return Data.index() ==
Data.value();
5191 return std::nullopt;
5192 return std::move(ResOrder);
5194 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5195 any_of(TE.UserTreeIndices,
5197 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5199 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
5200 return std::nullopt;
5201 if ((TE.State == TreeEntry::Vectorize ||
5202 TE.State == TreeEntry::StridedVectorize) &&
5203 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5204 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
5206 return TE.ReorderIndices;
5207 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5208 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
5209 Value *V1 = TE.Scalars[I1];
5210 Value *V2 = TE.Scalars[I2];
5211 if (V1 == V2 || (V1->
getNumUses() == 0 && V2->getNumUses() == 0))
5217 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->
user_begin());
5218 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5219 if (
auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
5220 if (
auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
5227 if (
auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
5228 if (
auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
5229 if (EE1->getOperand(0) != EE2->getOperand(0))
5235 auto IsIdentityOrder = [](
const OrdersType &Order) {
5236 for (
unsigned Idx : seq<unsigned>(0, Order.size()))
5241 if (!TE.ReorderIndices.empty())
5242 return TE.ReorderIndices;
5245 std::iota(Phis.begin(), Phis.end(), 0);
5247 for (
unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
5250 for (
unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
5251 ResOrder[Id] = PhiToId[Phis[Id]];
5252 if (IsIdentityOrder(ResOrder))
5253 return std::nullopt;
5254 return std::move(ResOrder);
5256 if (TE.isGather() && !TE.isAltShuffle() &&
allSameType(TE.Scalars)) {
5259 if ((TE.getOpcode() == Instruction::ExtractElement ||
5260 (
all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5261 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5263 auto *EE = dyn_cast<ExtractElementInst>(V);
5264 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5269 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
5271 if (Reuse || !CurrentOrder.
empty())
5272 return std::move(CurrentOrder);
5280 int Sz = TE.Scalars.size();
5282 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5284 find_if(TE.Scalars, [](
Value *V) { return !isConstant(V); });
5285 if (It == TE.Scalars.begin())
5288 if (It != TE.Scalars.end()) {
5290 unsigned Idx = std::distance(TE.Scalars.begin(), It);
5305 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5308 return std::move(Order);
5313 return std::nullopt;
5314 if (TE.Scalars.size() >= 4)
5318 return CurrentOrder;
5320 return std::nullopt;
5330 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
5332 if (Cluster != FirstCluster)
5338void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const {
5341 const unsigned Sz =
TE.Scalars.size();
5343 if (!
TE.isGather() ||
5350 addMask(NewMask,
TE.ReuseShuffleIndices);
5352 TE.ReorderIndices.clear();
5359 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
5360 *
End =
TE.ReuseShuffleIndices.end();
5361 It !=
End; std::advance(It, Sz))
5362 std::iota(It, std::next(It, Sz), 0);
5368 "Expected same size of orders");
5369 unsigned Sz = Order.
size();
5371 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
5372 if (Order[
Idx] != Sz)
5373 UsedIndices.
set(Order[
Idx]);
5375 if (SecondaryOrder.
empty()) {
5376 for (
unsigned Idx : seq<unsigned>(0, Sz))
5377 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
5380 for (
unsigned Idx : seq<unsigned>(0, Sz))
5381 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
5382 !UsedIndices.
test(SecondaryOrder[
Idx]))
5383 Order[
Idx] = SecondaryOrder[
Idx];
5403 ExternalUserReorderMap;
5408 const std::unique_ptr<TreeEntry> &TE) {
5411 findExternalStoreUsersReorderIndices(TE.get());
5412 if (!ExternalUserReorderIndices.
empty()) {
5413 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5415 std::move(ExternalUserReorderIndices));
5421 if (TE->isAltShuffle()) {
5424 unsigned Opcode0 = TE->getOpcode();
5425 unsigned Opcode1 = TE->getAltOpcode();
5428 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5429 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5435 if (std::optional<OrdersType> CurrentOrder =
5445 const TreeEntry *UserTE = TE.get();
5447 if (UserTE->UserTreeIndices.size() != 1)
5450 return EI.UserTE->State == TreeEntry::Vectorize &&
5451 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5454 UserTE = UserTE->UserTreeIndices.back().UserTE;
5457 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5458 if (!(TE->State == TreeEntry::Vectorize ||
5459 TE->State == TreeEntry::StridedVectorize) ||
5460 !TE->ReuseShuffleIndices.empty())
5461 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
5462 if (TE->State == TreeEntry::Vectorize &&
5463 TE->getOpcode() == Instruction::PHI)
5464 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
5469 for (
unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5471 auto It = VFToOrderedEntries.
find(VF);
5472 if (It == VFToOrderedEntries.
end())
5484 for (
const TreeEntry *OpTE : OrderedEntries) {
5487 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
5490 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5492 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
5493 auto It = GathersToOrders.find(OpTE);
5494 if (It != GathersToOrders.end())
5497 if (OpTE->isAltShuffle()) {
5498 auto It = AltShufflesToOrders.find(OpTE);
5499 if (It != AltShufflesToOrders.end())
5502 if (OpTE->State == TreeEntry::Vectorize &&
5503 OpTE->getOpcode() == Instruction::PHI) {
5504 auto It = PhisToOrders.
find(OpTE);
5505 if (It != PhisToOrders.
end())
5508 return OpTE->ReorderIndices;
5511 auto It = ExternalUserReorderMap.
find(OpTE);
5512 if (It != ExternalUserReorderMap.
end()) {
5513 const auto &ExternalUserReorderIndices = It->second;
5517 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5518 OrdersUses.insert(std::make_pair(
OrdersType(), 0)).first->second +=
5519 ExternalUserReorderIndices.size();
5521 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
5522 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5529 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5530 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5533 unsigned E = Order.size();
5536 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5539 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5541 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5544 if (OrdersUses.empty())
5547 const unsigned Sz = Order.size();
5548 for (
unsigned Idx : seq<unsigned>(0, Sz))
5549 if (
Idx != Order[
Idx] && Order[
Idx] != Sz)
5554 unsigned IdentityCnt = 0;
5555 unsigned FilledIdentityCnt = 0;
5557 for (
auto &Pair : OrdersUses) {
5558 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5559 if (!Pair.first.empty())
5560 FilledIdentityCnt += Pair.second;
5561 IdentityCnt += Pair.second;
5566 unsigned Cnt = IdentityCnt;
5567 for (
auto &Pair : OrdersUses) {
5571 if (Cnt < Pair.second ||
5572 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5573 Cnt == Pair.second && !BestOrder.
empty() &&
5574 IsIdentityOrder(BestOrder))) {
5576 BestOrder = Pair.first;
5583 if (IsIdentityOrder(BestOrder))
5589 unsigned E = BestOrder.
size();
5591 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5594 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5596 if (TE->Scalars.size() != VF) {
5597 if (TE->ReuseShuffleIndices.size() == VF) {
5603 return EI.UserTE->Scalars.size() == VF ||
5604 EI.UserTE->Scalars.size() ==
5607 "All users must be of VF size.");
5610 reorderNodeWithReuses(*TE, Mask);
5614 if ((TE->State == TreeEntry::Vectorize ||
5615 TE->State == TreeEntry::StridedVectorize) &&
5618 !TE->isAltShuffle()) {
5622 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5623 TE->reorderOperands(Mask);
5626 TE->reorderOperands(Mask);
5627 assert(TE->ReorderIndices.empty() &&
5628 "Expected empty reorder sequence.");
5631 if (!TE->ReuseShuffleIndices.empty()) {
5638 addMask(NewReuses, TE->ReuseShuffleIndices);
5639 TE->ReuseShuffleIndices.swap(NewReuses);
5645bool BoUpSLP::canReorderOperands(
5646 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5650 if (UserTE->isNonPowOf2Vec())
5653 for (
unsigned I = 0,
E = UserTE->getNumOperands();
I <
E; ++
I) {
5654 if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
5655 return OpData.first ==
I &&
5656 (OpData.second->State == TreeEntry::Vectorize ||
5657 OpData.second->State == TreeEntry::StridedVectorize);
5660 if (TreeEntry *TE = getVectorizedOperand(UserTE,
I)) {
5662 if (
any_of(TE->UserTreeIndices,
5663 [UserTE](
const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5667 Edges.emplace_back(
I, TE);
5673 if (TE->State != TreeEntry::Vectorize &&
5674 TE->State != TreeEntry::StridedVectorize &&
5675 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5679 TreeEntry *
Gather =
nullptr;
5681 [&
Gather, UserTE,
I](TreeEntry *TE) {
5682 assert(TE->State != TreeEntry::Vectorize &&
5683 TE->State != TreeEntry::StridedVectorize &&
5684 "Only non-vectorized nodes are expected.");
5685 if (
any_of(TE->UserTreeIndices,
5686 [UserTE,
I](
const EdgeInfo &EI) {
5687 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5689 assert(TE->isSame(UserTE->getOperand(
I)) &&
5690 "Operand entry does not match operands.");
5711 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5712 if (TE->State != TreeEntry::Vectorize &&
5713 TE->State != TreeEntry::StridedVectorize)
5715 if (std::optional<OrdersType> CurrentOrder =
5717 OrderedEntries.
insert(TE.get());
5718 if (!(TE->State == TreeEntry::Vectorize ||
5719 TE->State == TreeEntry::StridedVectorize) ||
5720 !TE->ReuseShuffleIndices.empty())
5721 GathersToOrders.
insert(TE.get());
5730 while (!OrderedEntries.
empty()) {
5735 for (TreeEntry *TE : OrderedEntries) {
5736 if (!(TE->State == TreeEntry::Vectorize ||
5737 TE->State == TreeEntry::StridedVectorize ||
5738 (TE->isGather() && GathersToOrders.
contains(TE))) ||
5739 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5742 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5744 !Visited.
insert(TE).second) {
5750 for (
EdgeInfo &EI : TE->UserTreeIndices) {
5751 TreeEntry *UserTE = EI.
UserTE;
5752 auto It =
Users.find(UserTE);
5753 if (It ==
Users.end())
5754 It =
Users.insert({UserTE, {}}).first;
5755 It->second.emplace_back(EI.
EdgeIdx, TE);
5759 for (TreeEntry *TE : Filtered)
5760 OrderedEntries.remove(TE);
5762 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5764 sort(UsersVec, [](
const auto &Data1,
const auto &Data2) {
5765 return Data1.first->Idx > Data2.first->Idx;
5767 for (
auto &
Data : UsersVec) {
5770 if (!canReorderOperands(
Data.first,
Data.second, NonVectorized,
5772 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5773 OrderedEntries.remove(
Op.second);
5786 for (
const auto &
Op :
Data.second) {
5787 TreeEntry *OpTE =
Op.second;
5788 if (!VisitedOps.
insert(OpTE).second)
5790 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
5792 const auto Order = [&]() ->
const OrdersType {
5793 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
5796 return OpTE->ReorderIndices;
5800 if (Order.size() == 1)
5803 Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
5804 return P.second == OpTE;
5807 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5808 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5811 unsigned E = Order.size();
5814 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5817 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5820 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5822 auto Res = OrdersUses.insert(std::make_pair(
OrdersType(), 0));
5823 const auto AllowsReordering = [&](
const TreeEntry *TE) {
5825 if (TE->isNonPowOf2Vec())
5827 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5828 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5829 (IgnoreReorder && TE->Idx == 0))
5831 if (TE->isGather()) {
5840 for (
const EdgeInfo &EI : OpTE->UserTreeIndices) {
5841 TreeEntry *UserTE = EI.
UserTE;
5842 if (!VisitedUsers.
insert(UserTE).second)
5847 if (AllowsReordering(UserTE))
5855 if (
static_cast<unsigned>(
count_if(
5856 Ops, [UserTE, &AllowsReordering](
5857 const std::pair<unsigned, TreeEntry *> &
Op) {
5858 return AllowsReordering(
Op.second) &&
5861 return EI.UserTE == UserTE;
5863 })) <= Ops.
size() / 2)
5864 ++Res.first->second;
5867 if (OrdersUses.empty()) {
5868 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5869 OrderedEntries.remove(
Op.second);
5873 const unsigned Sz = Order.size();
5874 for (
unsigned Idx : seq<unsigned>(0, Sz))
5875 if (
Idx != Order[
Idx] && Order[
Idx] != Sz)
5880 unsigned IdentityCnt = 0;
5881 unsigned VF =
Data.second.front().second->getVectorFactor();
5883 for (
auto &Pair : OrdersUses) {
5884 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5885 IdentityCnt += Pair.second;
5890 unsigned Cnt = IdentityCnt;
5891 for (
auto &Pair : OrdersUses) {
5895 if (Cnt < Pair.second) {
5897 BestOrder = Pair.first;
5904 if (IsIdentityOrder(BestOrder)) {
5905 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5906 OrderedEntries.remove(
Op.second);
5915 unsigned E = BestOrder.
size();
5917 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5919 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
5920 TreeEntry *TE =
Op.second;
5921 OrderedEntries.remove(TE);
5922 if (!VisitedOps.
insert(TE).second)
5924 if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
5925 reorderNodeWithReuses(*TE, Mask);
5929 if (TE->State != TreeEntry::Vectorize &&
5930 TE->State != TreeEntry::StridedVectorize &&
5931 (TE->State != TreeEntry::ScatterVectorize ||
5932 TE->ReorderIndices.empty()))
5934 assert((BestOrder.
size() == TE->ReorderIndices.size() ||
5935 TE->ReorderIndices.empty()) &&
5936 "Non-matching sizes of user/operand entries.");
5938 if (IgnoreReorder && TE == VectorizableTree.front().get())
5939 IgnoreReorder =
false;
5942 for (TreeEntry *
Gather : GatherOps) {
5944 "Unexpected reordering of gathers.");
5945 if (!
Gather->ReuseShuffleIndices.empty()) {
5951 OrderedEntries.remove(
Gather);
5955 if (
Data.first->State != TreeEntry::Vectorize ||
5956 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5957 Data.first->getMainOp()) ||
5958 Data.first->isAltShuffle())
5959 Data.first->reorderOperands(Mask);
5960 if (!isa<InsertElementInst, StoreInst>(
Data.first->getMainOp()) ||
5961 Data.first->isAltShuffle() ||
5962 Data.first->State == TreeEntry::StridedVectorize) {
5966 if (
Data.first->ReuseShuffleIndices.empty() &&
5967 !
Data.first->ReorderIndices.empty() &&
5968 !
Data.first->isAltShuffle()) {
5971 OrderedEntries.insert(
Data.first);
5979 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5980 VectorizableTree.front()->ReuseShuffleIndices.empty())
5981 VectorizableTree.front()->ReorderIndices.clear();
5988 for (
auto &TEPtr : VectorizableTree) {
5989 TreeEntry *Entry = TEPtr.get();
5992 if (Entry->isGather())
5996 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5997 Value *Scalar = Entry->Scalars[Lane];
5998 if (!isa<Instruction>(Scalar))
6001 auto It = ScalarToExtUses.
find(Scalar);
6002 if (It != ScalarToExtUses.
end() && !ExternalUses[It->second].User)
6006 const auto *ExtI = ExternallyUsedValues.
find(Scalar);
6007 if (ExtI != ExternallyUsedValues.
end()) {
6008 int FoundLane = Entry->findLaneForValue(Scalar);
6009 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract: Extra arg from lane "
6010 << FoundLane <<
" from " << *Scalar <<
".\n");
6011 ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size());
6012 ExternalUses.emplace_back(Scalar,
nullptr, FoundLane);
6015 for (
User *U : Scalar->users()) {
6023 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
6027 if (TreeEntry *UseEntry = getTreeEntry(U)) {
6031 if (UseEntry->State == TreeEntry::ScatterVectorize ||
6033 Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
6034 LLVM_DEBUG(
dbgs() <<
"SLP: \tInternal user will be removed:" << *U
6036 assert(!UseEntry->isGather() &&
"Bad state");
6040 if (It != ScalarToExtUses.
end()) {
6041 ExternalUses[It->second].User =
nullptr;
6046 if (U && Scalar->hasNUsesOrMore(
UsesLimit))
6048 int FoundLane = Entry->findLaneForValue(Scalar);
6050 <<
" from lane " << FoundLane <<
" from " << *Scalar
6052 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
6053 ExternalUses.emplace_back(Scalar, U, FoundLane);
6062BoUpSLP::collectUserStores(
const BoUpSLP::TreeEntry *TE)
const {
6064 for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
6065 Value *V = TE->Scalars[Lane];
6067 if (isa<ConstantData>(V))
6074 for (
User *U : V->users()) {
6075 auto *SI = dyn_cast<StoreInst>(U);
6078 if (SI ==
nullptr || !SI->isSimple() || SI->getFunction() !=
F ||
6082 if (getTreeEntry(U))
6086 auto &StoresVec = PtrToStoresMap[
Ptr];
6089 if (StoresVec.size() > Lane)
6092 if (!StoresVec.empty() &&
6093 SI->getParent() != StoresVec.back()->getParent())
6096 if (!StoresVec.empty() &&
6097 SI->getValueOperand()->getType() !=
6098 StoresVec.back()->getValueOperand()->getType())
6100 StoresVec.push_back(SI);
6103 return PtrToStoresMap;
6107 OrdersType &ReorderIndices)
const {
6115 StoreOffsetVec[0] = {S0, 0};
6118 for (
unsigned Idx : seq<unsigned>(1, StoresVec.
size())) {
6120 std::optional<int> Diff =
6122 SI->getPointerOperand(), *
DL, *SE,
6127 StoreOffsetVec[
Idx] = {StoresVec[
Idx], *Diff};
6132 stable_sort(StoreOffsetVec, [](
const std::pair<StoreInst *, int> &Pair1,
6133 const std::pair<StoreInst *, int> &Pair2) {
6134 int Offset1 = Pair1.second;
6135 int Offset2 = Pair2.second;
6136 return Offset1 < Offset2;
6140 for (
unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
6141 if (StoreOffsetVec[
Idx].second != StoreOffsetVec[
Idx - 1].second + 1)
6146 ReorderIndices.reserve(StoresVec.
size());
6149 [SI](
const std::pair<StoreInst *, int> &Pair) {
6150 return Pair.first ==
SI;
6152 StoreOffsetVec.begin();
6153 ReorderIndices.push_back(
Idx);
6158 auto IsIdentityOrder = [](
const OrdersType &Order) {
6159 for (
unsigned Idx : seq<unsigned>(0, Order.size()))
6164 if (IsIdentityOrder(ReorderIndices))
6165 ReorderIndices.clear();
6172 for (
unsigned Idx : Order)
6179BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE)
const {
6180 unsigned NumLanes =
TE->Scalars.size();
6183 collectUserStores(TE);
6192 for (
const auto &Pair : PtrToStoresMap) {
6193 auto &StoresVec = Pair.second;
6195 if (StoresVec.size() != NumLanes)
6200 if (!canFormVector(StoresVec, ReorderIndices))
6205 ExternalReorderIndices.
push_back(ReorderIndices);
6207 return ExternalReorderIndices;
6213 UserIgnoreList = &UserIgnoreLst;
6216 buildTree_rec(Roots, 0,
EdgeInfo());
6223 buildTree_rec(Roots, 0,
EdgeInfo());
6230 Value *NeedsScheduling =
nullptr;
6231 for (
Value *V : VL) {
6234 if (!NeedsScheduling) {
6235 NeedsScheduling = V;
6240 return NeedsScheduling;
6251 bool AllowAlternate) {
6255 if (
auto *LI = dyn_cast<LoadInst>(V)) {
6258 SubKey =
hash_value(LoadsSubkeyGenerator(Key, LI));
6263 if (isa<ExtractElementInst, UndefValue>(V))
6265 if (
auto *EI = dyn_cast<ExtractElementInst>(V)) {
6267 !isa<UndefValue>(EI->getIndexOperand()))
6270 }
else if (
auto *
I = dyn_cast<Instruction>(V)) {
6273 if ((isa<BinaryOperator, CastInst>(
I)) &&
6283 : cast<CastInst>(
I)->getOperand(0)->getType()));
6285 if (isa<CastInst>(
I)) {
6286 std::pair<size_t, size_t> OpVals =
6292 }
else if (
auto *CI = dyn_cast<CmpInst>(
I)) {
6294 if (CI->isCommutative())
6300 }
else if (
auto *Call = dyn_cast<CallInst>(
I)) {
6314 }
else if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
6315 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
6316 SubKey =
hash_value(Gep->getPointerOperand());
6320 !isa<ConstantInt>(
I->getOperand(1))) {
6328 return std::make_pair(Key, SubKey);
6338bool BoUpSLP::areAltOperandsProfitable(
const InstructionsState &S,
6340 unsigned Opcode0 = S.getOpcode();
6341 unsigned Opcode1 = S.getAltOpcode();
6345 Opcode0, Opcode1, OpcodeMask))
6348 for (
unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
6352 Operands.back().push_back(cast<Instruction>(V)->getOperand(
I));
6356 for (
unsigned I : seq<unsigned>(0, VL.size() - 1)) {
6362 switch (Res.value_or(0)) {
6377 constexpr unsigned NumAltInsts = 3;
6378 unsigned NonInstCnt = 0;
6381 unsigned UndefCnt = 0;
6383 unsigned ExtraShuffleInsts = 0;
6392 return is_contained(Operands.back(), V);
6395 ++ExtraShuffleInsts;
6412 if (isa<Constant, ExtractElementInst>(V) ||
6413 getTreeEntry(V) || (L &&
L->isLoopInvariant(V))) {
6414 if (isa<UndefValue>(V))
6420 if (!Res.second && Res.first->second == 1)
6421 ++ExtraShuffleInsts;
6422 ++Res.first->getSecond();
6423 if (
auto *
I = dyn_cast<Instruction>(V))
6424 UniqueOpcodes.
insert(
I->getOpcode());
6425 else if (Res.second)
6428 return none_of(Uniques, [&](
const auto &
P) {
6429 return P.first->hasNUsesOrMore(
P.second + 1) &&
6431 return getTreeEntry(U) || Uniques.contains(U);
6440 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
6441 (UniqueOpcodes.
size() + NonInstCnt + ExtraShuffleInsts +
6442 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
6445BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
6448 assert(S.MainOp &&
"Expected instructions with same/alternate opcodes only.");
6450 unsigned ShuffleOrOp =
6451 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
6452 auto *VL0 = cast<Instruction>(S.OpValue);
6453 switch (ShuffleOrOp) {
6454 case Instruction::PHI: {
6457 return TreeEntry::NeedToGather;
6460 for (
Value *
Incoming : cast<PHINode>(V)->incoming_values()) {
6462 if (Term &&
Term->isTerminator()) {
6464 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
6465 return TreeEntry::NeedToGather;
6469 return TreeEntry::Vectorize;
6471 case Instruction::ExtractValue:
6472 case Instruction::ExtractElement: {
6473 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6476 return TreeEntry::NeedToGather;
6477 if (Reuse || !CurrentOrder.empty())
6478 return TreeEntry::Vectorize;
6480 return TreeEntry::NeedToGather;
6482 case Instruction::InsertElement: {
6486 for (
Value *V : VL) {
6487 SourceVectors.
insert(cast<Instruction>(V)->getOperand(0));
6489 "Non-constant or undef index?");
6493 return !SourceVectors.contains(V);
6496 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
6497 "different source vectors.\n");
6498 return TreeEntry::NeedToGather;
6503 return SourceVectors.contains(V) && !
V->hasOneUse();
6506 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
6507 "multiple uses.\n");
6508 return TreeEntry::NeedToGather;
6511 return TreeEntry::Vectorize;
6513 case Instruction::Load: {
6522 return TreeEntry::Vectorize;
6524 return TreeEntry::ScatterVectorize;
6526 return TreeEntry::StridedVectorize;
6529 Type *ScalarTy = VL0->getType();
6530 if (
DL->getTypeSizeInBits(ScalarTy) !=
6531 DL->getTypeAllocSizeInBits(ScalarTy))
6532 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
6534 [](
Value *V) {
return !cast<LoadInst>(V)->isSimple(); }))
6539 return TreeEntry::NeedToGather;
6543 case Instruction::ZExt:
6544 case Instruction::SExt:
6545 case Instruction::FPToUI:
6546 case Instruction::FPToSI:
6547 case Instruction::FPExt:
6548 case Instruction::PtrToInt:
6549 case Instruction::IntToPtr:
6550 case Instruction::SIToFP:
6551 case Instruction::UIToFP:
6552 case Instruction::Trunc:
6553 case Instruction::FPTrunc:
6554 case Instruction::BitCast: {
6555 Type *SrcTy = VL0->getOperand(0)->getType();
6556 for (
Value *V : VL) {
6557 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
6560 dbgs() <<
"SLP: Gathering casts with different src types.\n");
6561 return TreeEntry::NeedToGather;
6564 return TreeEntry::Vectorize;
6566 case Instruction::ICmp:
6567 case Instruction::FCmp: {
6571 Type *ComparedTy = VL0->getOperand(0)->getType();
6572 for (
Value *V : VL) {
6574 if ((
Cmp->getPredicate() != P0 &&
Cmp->getPredicate() != SwapP0) ||
6575 Cmp->getOperand(0)->getType() != ComparedTy) {
6576 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering cmp with different predicate.\n");
6577 return TreeEntry::NeedToGather;
6580 return TreeEntry::Vectorize;
6582 case Instruction::Select:
6583 case Instruction::FNeg:
6584 case Instruction::Add:
6585 case Instruction::FAdd:
6586 case Instruction::Sub:
6587 case Instruction::FSub:
6588 case Instruction::Mul:
6589 case Instruction::FMul:
6590 case Instruction::UDiv:
6591 case Instruction::SDiv:
6592 case Instruction::FDiv:
6593 case Instruction::URem:
6594 case Instruction::SRem:
6595 case Instruction::FRem:
6596 case Instruction::Shl:
6597 case Instruction::LShr:
6598 case Instruction::AShr:
6599 case Instruction::And:
6600 case Instruction::Or:
6601 case Instruction::Xor:
6602 case Instruction::Freeze:
6603 return TreeEntry::Vectorize;
6604 case Instruction::GetElementPtr: {
6606 for (
Value *V : VL) {
6607 auto *
I = dyn_cast<GetElementPtrInst>(V);
6610 if (
I->getNumOperands() != 2) {
6611 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
6612 return TreeEntry::NeedToGather;
6618 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
6619 for (
Value *V : VL) {
6620 auto *
GEP = dyn_cast<GEPOperator>(V);
6623 Type *CurTy =
GEP->getSourceElementType();
6625 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
6626 return TreeEntry::NeedToGather;
6631 Type *Ty1 = VL0->getOperand(1)->getType();
6632 for (
Value *V : VL) {
6633 auto *
I = dyn_cast<GetElementPtrInst>(V);
6636 auto *
Op =
I->getOperand(1);
6637 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
6638 (
Op->getType() != Ty1 &&
6639 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
6640 Op->getType()->getScalarSizeInBits() >
6641 DL->getIndexSizeInBits(
6642 V->getType()->getPointerAddressSpace())))) {
6644 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
6645 return TreeEntry::NeedToGather;
6649 return TreeEntry::Vectorize;
6651 case Instruction::Store: {
6653 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
6656 if (
DL->getTypeSizeInBits(ScalarTy) !=
6657 DL->getTypeAllocSizeInBits(ScalarTy)) {
6658 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering stores of non-packed type.\n");
6659 return TreeEntry::NeedToGather;
6663 for (
Value *V : VL) {
6664 auto *
SI = cast<StoreInst>(V);
6665 if (!
SI->isSimple()) {
6667 return TreeEntry::NeedToGather;
6676 if (CurrentOrder.empty()) {
6677 Ptr0 = PointerOps.
front();
6678 PtrN = PointerOps.
back();
6680 Ptr0 = PointerOps[CurrentOrder.front()];
6681 PtrN = PointerOps[CurrentOrder.back()];
6683 std::optional<int> Dist =
6686 if (
static_cast<unsigned>(*Dist) == VL.size() - 1)
6687 return TreeEntry::Vectorize;
6691 return TreeEntry::NeedToGather;
6693 case Instruction::Call: {
6696 CallInst *CI = cast<CallInst>(VL0);
6707 return TreeEntry::NeedToGather;
6712 for (
unsigned J = 0; J != NumArgs; ++J)
6715 for (
Value *V : VL) {
6716 CallInst *CI2 = dyn_cast<CallInst>(V);
6722 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched calls:" << *CI <<
"!=" << *V
6724 return TreeEntry::NeedToGather;
6728 for (
unsigned J = 0; J != NumArgs; ++J) {
6731 if (ScalarArgs[J] != A1J) {
6733 <<
"SLP: mismatched arguments in call:" << *CI
6734 <<
" argument " << ScalarArgs[J] <<
"!=" << A1J <<
"\n");
6735 return TreeEntry::NeedToGather;
6744 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched bundle operands in calls:" << *CI
6745 <<
"!=" << *V <<
'\n');
6746 return TreeEntry::NeedToGather;
6750 return TreeEntry::Vectorize;
6752 case Instruction::ShuffleVector: {
6753 if (!S.isAltShuffle()) {
6756 return TreeEntry::Vectorize;
6759 LLVM_DEBUG(
dbgs() <<
"SLP: ShuffleVector are not vectorized.\n");
6760 return TreeEntry::NeedToGather;
6765 <<
"SLP: ShuffleVector not vectorized, operands are buildvector and "
6766 "the whole alt sequence is not profitable.\n");
6767 return TreeEntry::NeedToGather;
6770 return TreeEntry::Vectorize;
6774 return TreeEntry::NeedToGather;
6788 PHIHandler() =
delete;
6790 : DT(DT), Main(Main), Phis(Phis),
6791 Operands(Main->getNumIncomingValues(),
6793 void buildOperands() {
6794 constexpr unsigned FastLimit = 4;
6804 auto *
P = cast<PHINode>(V);
6805 if (
P->getIncomingBlock(
I) == InBB)
6820 Blocks.try_emplace(InBB).first->second.push_back(
I);
6823 auto *
P = cast<PHINode>(V);
6824 for (
unsigned I : seq<unsigned>(0,
P->getNumIncomingValues())) {
6832 auto It =
Blocks.find(InBB);
6838 for (
const auto &
P :
Blocks) {
6839 if (
P.getSecond().size() <= 1)
6841 unsigned BasicI =
P.getSecond().front();
6844 [&](
const auto &Data) {
6845 return !Data.value() ||
6846 Data.value() ==
Operands[BasicI][Data.index()];
6848 "Expected empty operands list.");
6858 const EdgeInfo &UserTreeIdx) {
6864 auto TryToFindDuplicates = [&](
const InstructionsState &S,
6865 bool DoNotFail =
false) {
6868 for (
Value *V : VL) {
6875 auto Res = UniquePositions.try_emplace(V, UniqueValues.
size());
6880 size_t NumUniqueScalarValues = UniqueValues.
size();
6881 if (NumUniqueScalarValues == VL.size()) {
6882 ReuseShuffleIndices.
clear();
6885 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6886 LLVM_DEBUG(
dbgs() <<
"SLP: Reshuffling scalars not yet supported "
6887 "for nodes with padding.\n");
6888 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6892 if (NumUniqueScalarValues <= 1 ||
6893 (UniquePositions.size() == 1 &&
all_of(UniqueValues,
6895 return isa<UndefValue>(V) ||
6898 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6899 if (DoNotFail && UniquePositions.size() > 1 &&
6900 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6902 return isa<ExtractElementInst>(V) ||
6903 areAllUsersVectorized(cast<Instruction>(V),
6907 if (PWSz == VL.size()) {
6908 ReuseShuffleIndices.
clear();
6910 NonUniqueValueVL.
assign(UniqueValues.
begin(), UniqueValues.
end());
6911 NonUniqueValueVL.
append(PWSz - UniqueValues.
size(),
6912 UniqueValues.
back());
6913 VL = NonUniqueValueVL;
6918 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6929 if (!EphValues.
empty()) {
6930 for (
Value *V : VL) {
6931 if (EphValues.
count(V)) {
6933 <<
") is ephemeral.\n");
6934 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6944 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6949 cast<Instruction>(
I)->getOpcode() ==
6950 cast<Instruction>(S.MainOp)->getOpcode();
6952 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to max recursion depth.\n");
6953 if (TryToFindDuplicates(S))
6954 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6955 ReuseShuffleIndices);
6960 if (S.getOpcode() == Instruction::ExtractElement &&
6961 isa<ScalableVectorType>(
6962 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6963 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to scalable vector type.\n");
6964 if (TryToFindDuplicates(S))
6965 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6966 ReuseShuffleIndices);
6971 if (!
SLPReVec && S.OpValue->getType()->isVectorTy() &&
6972 !isa<InsertElementInst>(S.OpValue)) {
6974 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6978 if (
StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6979 if (!
SLPReVec &&
SI->getValueOperand()->getType()->isVectorTy()) {
6980 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to store vector type.\n");
6981 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6990 auto &&NotProfitableForVectorization = [&S,
this,
6992 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
7001 for (
Value *V : VL) {
7002 auto *
I = cast<Instruction>(V);
7004 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
7008 if ((IsCommutative &&
7009 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
7011 all_of(InstsCount, [](
unsigned ICnt) {
return ICnt < 2; })))
7013 assert(VL.size() == 2 &&
"Expected only 2 alternate op instructions.");
7015 auto *
I1 = cast<Instruction>(VL.front());
7016 auto *I2 = cast<Instruction>(VL.back());
7019 I2->getOperand(
Op));
7020 if (
static_cast<unsigned>(
count_if(
7021 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
7023 })) >= S.MainOp->getNumOperands() / 2)
7025 if (S.MainOp->getNumOperands() > 2)
7027 if (IsCommutative) {
7032 I2->getOperand((
Op + 1) % E));
7034 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
7043 bool IsScatterVectorizeUserTE =
7044 UserTreeIdx.UserTE &&
7045 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
7046 bool AreAllSameBlock = S.getOpcode() &&
allSameBlock(VL);
7047 bool AreScatterAllGEPSameBlock =
7048 (IsScatterVectorizeUserTE && S.OpValue->getType()->isPointerTy() &&
7052 auto *
I = dyn_cast<GetElementPtrInst>(V);
7056 BB =
I->getParent();
7057 return BB ==
I->getParent() &&
I->getNumOperands() == 2;
7060 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
7062 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
7064 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
7067 NotProfitableForVectorization(VL)) {
7068 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to C,S,B,O, small shuffle. \n");
7069 if (TryToFindDuplicates(S))
7070 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
7071 ReuseShuffleIndices);
7079 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
7080 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *S.OpValue <<
".\n");
7081 if (!E->isSame(VL)) {
7082 auto It = MultiNodeScalars.
find(S.OpValue);
7083 if (It != MultiNodeScalars.
end()) {
7084 auto *TEIt =
find_if(It->getSecond(),
7085 [&](TreeEntry *ME) { return ME->isSame(VL); });
7086 if (TEIt != It->getSecond().end())
7096 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to partial overlap.\n");
7097 if (TryToFindDuplicates(S))
7098 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
7099 ReuseShuffleIndices);
7103 Nodes.
insert(getTreeEntry(S.OpValue));
7104 for (
const TreeEntry *E : MultiNodeScalars.
lookup(S.OpValue))
7107 if (
any_of(Nodes, [&](
const TreeEntry *E) {
7108 return all_of(E->Scalars,
7109 [&](
Value *V) { return Values.contains(V); });
7112 if (TryToFindDuplicates(S))
7113 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
7114 ReuseShuffleIndices);
7120 E->UserTreeIndices.push_back(UserTreeIdx);
7121 LLVM_DEBUG(
dbgs() <<
"SLP: Perfect diamond merge at " << *S.OpValue
7128 for (
Value *V : VL) {
7129 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
7132 if (getTreeEntry(V)) {
7134 <<
") is already in tree.\n");
7135 if (TryToFindDuplicates(S))
7136 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
7137 ReuseShuffleIndices);
7143 if (UserIgnoreList && !UserIgnoreList->empty()) {
7144 for (
Value *V : VL) {
7145 if (UserIgnoreList && UserIgnoreList->contains(V)) {
7146 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to gathered scalar.\n");
7147 if (TryToFindDuplicates(S))
7148 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
7149 ReuseShuffleIndices);
7157 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
7158 assert(S.OpValue->getType()->isPointerTy() &&
7159 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
7160 "Expected pointers only.");
7162 const auto *It =
find_if(VL, IsaPred<GetElementPtrInst>);
7163 assert(It != VL.end() &&
"Expected at least one GEP.");
7169 auto *VL0 = cast<Instruction>(S.OpValue);
7176 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
7185 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
7190 if (!TryToFindDuplicates(S,
true))
7196 TreeEntry::EntryState State = getScalarsVectorizationState(
7197 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
7198 if (State == TreeEntry::NeedToGather) {
7199 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
7200 ReuseShuffleIndices);
7204 auto &BSRef = BlocksSchedules[BB];
7206 BSRef = std::make_unique<BlockScheduling>(BB);
7208 BlockScheduling &BS = *BSRef;
7210 std::optional<ScheduleData *> Bundle =
7211 BS.tryScheduleBundle(UniqueValues,
this, S);
7212#ifdef EXPENSIVE_CHECKS
7217 LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
7218 assert((!BS.getScheduleData(VL0) ||
7219 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
7220 "tryScheduleBundle should cancelScheduling on failure");
7221 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
7222 ReuseShuffleIndices);
7223 NonScheduledFirst.insert(VL.front());
7226 LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
7228 unsigned ShuffleOrOp = S.isAltShuffle() ?
7229 (
unsigned) Instruction::ShuffleVector : S.getOpcode();
7230 switch (ShuffleOrOp) {
7231 case Instruction::PHI: {
7232 auto *PH = cast<PHINode>(VL0);
7235 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
7239 PHIHandler Handler(*DT, PH, VL);
7240 Handler.buildOperands();
7241 for (
unsigned I : seq<unsigned>(0, PH->getNumOperands()))
7242 TE->setOperand(
I, Handler.getOperands(
I));
7243 for (
unsigned I : seq<unsigned>(0, PH->getNumOperands()))
7244 buildTree_rec(Handler.getOperands(
I),
Depth + 1, {TE, I});
7247 case Instruction::ExtractValue:
7248 case Instruction::ExtractElement: {
7249 if (CurrentOrder.empty()) {
7250 LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
7253 dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence "
7255 for (
unsigned Idx : CurrentOrder)
7263 newTreeEntry(VL, Bundle , S, UserTreeIdx,
7264 ReuseShuffleIndices, CurrentOrder);
7268 Op0.
assign(VL.size(), VL0->getOperand(0));
7269 VectorizableTree.back()->setOperand(0, Op0);
7272 case Instruction::InsertElement: {
7273 assert(ReuseShuffleIndices.
empty() &&
"All inserts should be unique");
7275 auto OrdCompare = [](
const std::pair<int, int> &P1,
7276 const std::pair<int, int> &P2) {
7277 return P1.first > P2.first;
7280 decltype(OrdCompare)>
7281 Indices(OrdCompare);
7282 for (
int I = 0, E = VL.size();
I < E; ++
I) {
7284 Indices.emplace(
Idx,
I);
7286 OrdersType CurrentOrder(VL.size(), VL.size());
7287 bool IsIdentity =
true;
7288 for (
int I = 0, E = VL.size();
I < E; ++
I) {
7289 CurrentOrder[Indices.top().second] =
I;
7290 IsIdentity &= Indices.top().second ==
I;
7294 CurrentOrder.clear();
7295 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7296 std::nullopt, CurrentOrder);
7299 TE->setOperandsInOrder();
7300 buildTree_rec(
TE->getOperand(1),
Depth + 1, {TE, 1});
7303 case Instruction::Load: {
7310 TreeEntry *
TE =
nullptr;
7313 case TreeEntry::Vectorize:
7314 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7315 ReuseShuffleIndices, CurrentOrder);
7316 if (CurrentOrder.empty())
7320 TE->setOperandsInOrder();
7322 case TreeEntry::StridedVectorize:
7324 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
7325 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
7326 TE->setOperandsInOrder();
7329 case TreeEntry::ScatterVectorize:
7331 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
7332 UserTreeIdx, ReuseShuffleIndices);
7333 TE->setOperandsInOrder();
7334 buildTree_rec(PointerOps,
Depth + 1, {
TE, 0});
7335 LLVM_DEBUG(
dbgs() <<
"SLP: added a vector of non-consecutive loads.\n");
7337 case TreeEntry::CombinedVectorize:
7338 case TreeEntry::NeedToGather:
7343 case Instruction::ZExt:
7344 case Instruction::SExt:
7345 case Instruction::FPToUI:
7346 case Instruction::FPToSI:
7347 case Instruction::FPExt:
7348 case Instruction::PtrToInt:
7349 case Instruction::IntToPtr:
7350 case Instruction::SIToFP:
7351 case Instruction::UIToFP:
7352 case Instruction::Trunc:
7353 case Instruction::FPTrunc:
7354 case Instruction::BitCast: {
7355 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
7356 std::make_pair(std::numeric_limits<unsigned>::min(),
7357 std::numeric_limits<unsigned>::max()));
7358 if (ShuffleOrOp == Instruction::ZExt ||
7359 ShuffleOrOp == Instruction::SExt) {
7360 CastMaxMinBWSizes = std::make_pair(
7366 }
else if (ShuffleOrOp == Instruction::Trunc) {
7367 CastMaxMinBWSizes = std::make_pair(
7373 ExtraBitWidthNodes.
insert(VectorizableTree.size() + 1);
7374 }
else if (ShuffleOrOp == Instruction::SIToFP ||
7375 ShuffleOrOp == Instruction::UIToFP) {
7376 unsigned NumSignBits =
7378 if (
auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
7380 NumSignBits = std::max(NumSignBits,
Mask.countl_zero());
7382 if (NumSignBits * 2 >=
7384 ExtraBitWidthNodes.
insert(VectorizableTree.size() + 1);
7386 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7387 ReuseShuffleIndices);
7390 TE->setOperandsInOrder();
7391 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7392 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
7395 case Instruction::ICmp:
7396 case Instruction::FCmp: {
7399 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7400 ReuseShuffleIndices);
7408 "Commutative Predicate mismatch");
7409 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7412 for (
Value *V : VL) {
7413 auto *
Cmp = cast<CmpInst>(V);
7416 if (
Cmp->getPredicate() != P0)
7418 Left.push_back(LHS);
7419 Right.push_back(RHS);
7426 if (ShuffleOrOp == Instruction::ICmp) {
7427 unsigned NumSignBits0 =
7429 if (NumSignBits0 * 2 >=
7431 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
7432 unsigned NumSignBits1 =
7434 if (NumSignBits1 * 2 >=
7436 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 1)->
Idx);
7440 case Instruction::Select:
7441 case Instruction::FNeg:
7442 case Instruction::Add:
7443 case Instruction::FAdd:
7444 case Instruction::Sub:
7445 case Instruction::FSub:
7446 case Instruction::Mul:
7447 case Instruction::FMul:
7448 case Instruction::UDiv:
7449 case Instruction::SDiv:
7450 case Instruction::FDiv:
7451 case Instruction::URem:
7452 case Instruction::SRem:
7453 case Instruction::FRem:
7454 case Instruction::Shl:
7455 case Instruction::LShr:
7456 case Instruction::AShr:
7457 case Instruction::And:
7458 case Instruction::Or:
7459 case Instruction::Xor:
7460 case Instruction::Freeze: {
7461 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7462 ReuseShuffleIndices);
7469 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7477 TE->setOperandsInOrder();
7478 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7479 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
7482 case Instruction::GetElementPtr: {
7483 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7484 ReuseShuffleIndices);
7488 for (
Value *V : VL) {
7489 auto *
GEP = dyn_cast<GetElementPtrInst>(V);
7494 Operands.front().push_back(
GEP->getPointerOperand());
7503 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
7505 [VL0Ty, IndexIdx](
Value *V) {
7506 auto *
GEP = dyn_cast<GetElementPtrInst>(V);
7509 return VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
7513 ->getPointerOperandType()
7516 for (
Value *V : VL) {
7517 auto *
I = dyn_cast<GetElementPtrInst>(V);
7520 ConstantInt::get(Ty, 0,
false));
7523 auto *
Op =
I->getOperand(IndexIdx);
7524 auto *CI = dyn_cast<ConstantInt>(
Op);
7529 CI, Ty, CI->getValue().isSignBitSet(), *DL));
7533 for (
unsigned I = 0, Ops =
Operands.size();
I < Ops; ++
I)
7537 case Instruction::Store: {
7538 bool Consecutive = CurrentOrder.empty();
7541 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7542 ReuseShuffleIndices, CurrentOrder);
7543 TE->setOperandsInOrder();
7544 buildTree_rec(
TE->getOperand(0),
Depth + 1, {TE, 0});
7548 LLVM_DEBUG(
dbgs() <<
"SLP: added a vector of jumbled stores.\n");
7551 case Instruction::Call: {
7554 CallInst *CI = cast<CallInst>(VL0);
7557 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7558 ReuseShuffleIndices);
7563 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7567 for (
unsigned I : seq<unsigned>(2, CI->
arg_size())) {
7571 for (
Value *V : VL) {
7572 auto *CI2 = cast<CallInst>(V);
7579 for (
unsigned I : seq<unsigned>(2, CI->
arg_size())) {
7586 TE->setOperandsInOrder();
7587 for (
unsigned I : seq<unsigned>(0, CI->
arg_size())) {
7594 for (
Value *V : VL) {
7595 auto *CI2 = cast<CallInst>(V);
7602 case Instruction::ShuffleVector: {
7603 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7604 ReuseShuffleIndices);
7608 auto *CI = dyn_cast<CmpInst>(VL0);
7609 if (isa<BinaryOperator>(VL0) || CI) {
7612 return cast<CmpInst>(V)->isCommutative();
7614 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7616 auto *MainCI = cast<CmpInst>(S.MainOp);
7617 auto *AltCI = cast<CmpInst>(S.AltOp);
7621 "Expected different main/alternate predicates.");
7624 for (
Value *V : VL) {
7625 auto *
Cmp = cast<CmpInst>(V);
7636 Left.push_back(LHS);
7637 Right.push_back(RHS);
7647 TE->setOperandsInOrder();
7648 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7649 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
7662 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
7663 if (
auto *ST = dyn_cast<StructType>(EltTy)) {
7665 for (
const auto *Ty : ST->elements())
7666 if (Ty != *ST->element_begin())
7668 N *= ST->getNumElements();
7669 EltTy = *ST->element_begin();
7670 }
else if (
auto *AT = dyn_cast<ArrayType>(EltTy)) {
7671 N *= AT->getNumElements();
7672 EltTy = AT->getElementType();
7674 auto *VT = cast<FixedVectorType>(EltTy);
7675 N *= VT->getNumElements();
7676 EltTy = VT->getElementType();
7683 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, /* ... */
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  assert(
      all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
      "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *Vec = E0->getOperand(0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of
  // elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    // ...
    // Check if the load can be rewritten as a load of a vector.
    LoadInst *LI = dyn_cast<LoadInst>(Vec);
    // ...
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }

  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    // ...
    if (Inst->getOperand(0) != Vec)
      return false;
    if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
      if (isa<UndefValue>(EE->getIndexOperand()))
        continue;
    // ...
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign E initially so we can later detect that an extract index was used
  // twice.
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
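// Illustrative sketch (not part of the original source): the same
// order-checking idea as in canReuseExtract above, reduced to plain integers.
// Given each scalar's extract index (or -1 for "don't care"), decide whether
// the bundle already reads the source vector in identity order, and if not,
// compute the permutation that would. Standalone; assumes only the C++
// standard library.
#include <algorithm>
#include <optional>
#include <vector>

// Returns std::nullopt if the indices cannot be mapped onto one bundle-sized
// window; otherwise returns the reuse order (empty when the order is already
// the identity and no shuffle is needed).
inline std::optional<std::vector<unsigned>>
computeReuseOrder(const std::vector<int> &ExtIdx, unsigned NElts) {
  const unsigned E = ExtIdx.size();
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (int Idx : ExtIdx) {
    if (Idx < 0 || static_cast<unsigned>(Idx) >= NElts)
      continue;
    MinIdx = std::min(MinIdx, static_cast<unsigned>(Idx));
    MaxIdx = std::max(MaxIdx, static_cast<unsigned>(Idx));
  }
  if (MaxIdx - MinIdx + 1 > E)
    return std::nullopt; // Indices span more than one window of E elements.
  if (MaxIdx + 1 <= E)
    MinIdx = 0; // The window already starts at the vector base.
  bool Identity = true;
  std::vector<unsigned> Order(E, E); // E marks "unused slot".
  for (unsigned I = 0; I < E; ++I) {
    if (ExtIdx[I] < 0)
      continue;
    unsigned Slot = static_cast<unsigned>(ExtIdx[I]) - MinIdx;
    if (Order[Slot] != E)
      return std::nullopt; // Two lanes want the same source element.
    Identity &= Slot == I;
    Order[Slot] = I;
  }
  if (Identity)
    Order.clear(); // Mirrors CurrentOrder.clear(): keep the original order.
  return Order;
}
// e.g. ExtIdx = {1, 0, 3, 2} with NElts = 4 yields the order {1, 0, 3, 2}.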
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return ScalarToTreeEntry.contains(U) ||
                  isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   ArrayRef<Type *> ArgTys) {
  // ...
  // Calculate the cost of the scalar and vector calls.
  FastMathFlags FMF;
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF,
                                    dyn_cast<IntrinsicInst>(CI));
  auto IntrinsicCost =
      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
  // ...
  auto LibCost = IntrinsicCost;
  // ...
  return {IntrinsicCost, LibCost};
}
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp,
    SmallVectorImpl<int> &Mask, SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      // ...
    } else {
      Mask[I] = Idx;
      // ...
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}
static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    CmpInst::Predicate MainP = MainCI->getPredicate();
    CmpInst::Predicate AltP = AltCI->getPredicate();
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    // ...
    assert((MainP == P || AltP == P || MainP == SwappedP ||
            AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swap.");
    // ...
    return MainP != P && MainP != SwappedP;
  }
  return I->getOpcode() == AltOp->getOpcode();
}
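// Illustrative sketch (not part of the original source): how an alternate-op
// shuffle mask is assembled once each lane has been classified as main or
// alternate (as isAlternateInstruction does above). Lanes whose scalar
// matches the alternate opcode select from the second vector, i.e. get index
// Sz + lane; all other lanes select from the main vector. Standalone,
// standard library only.
#include <functional>
#include <vector>

inline std::vector<int>
buildAltMask(unsigned Sz, const std::function<bool(unsigned)> &IsAltLane) {
  std::vector<int> Mask(Sz);
  for (unsigned I = 0; I < Sz; ++I)
    Mask[I] = IsAltLane(I) ? static_cast<int>(Sz + I) : static_cast<int>(I);
  return Mask;
}

// For a bundle {add, sub, add, sub} of width 4, buildAltMask(4, isOddLane)
// yields {0, 5, 2, 7}: emit one vector add, one vector sub, then a single
// shufflevector picks even lanes from the add result and odd lanes from the
// sub result.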
  const auto *Op0 = Ops.front();
  // ...
  const bool IsUniform = all_of(Ops, [=](Value *V) {
    // TODO: We should allow undef elements here.
    return V == Op0;
  });
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();
    return false;
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
    return false;
  });
  // ...
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  // ...
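// Illustrative sketch (not part of the original source): the operand-kind
// classification above, on plain integers. A uniform constant operand (e.g. a
// splat shift amount) is the cheapest case for most targets, and an
// all-power-of-two divisor lets a cost model assume a shift-based lowering.
// Standalone, standard library only.
#include <cstdint>
#include <vector>

enum class OperandKind { UniformConstant, NonUniformConstant };

inline OperandKind classifyConstantOperands(const std::vector<int64_t> &Ops,
                                            bool &AllPowerOfTwo) {
  bool Uniform = true;
  AllPowerOfTwo = true;
  for (int64_t V : Ops) {
    Uniform &= V == Ops.front();
    AllPowerOfTwo &= V > 0 && (V & (V - 1)) == 0;
  }
  // Every entry is a constant here by construction, so the only question is
  // whether the constant is the same in all lanes.
  return Uniform ? OperandKind::UniformConstant
                 : OperandKind::NonUniformConstant;
}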
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value. With REVEC disabled there is no
  /// difference between VF and the element count of V; with REVEC enabled,
  /// VF is VNumElements / ScalarTyNumElements.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }
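// Illustrative sketch (not part of the original source): the VF computation
// above. Under revectorization (SLPReVec) the "scalar" itself may be a small
// vector, so the effective vectorization factor is the element count of the
// wide value divided by the element count of one scalar. Standalone.
#include <cassert>

inline unsigned effectiveVF(unsigned WideNumElements,
                            unsigned ScalarNumElements) {
  assert(ScalarNumElements != 0 && WideNumElements > ScalarNumElements &&
         WideNumElements % ScalarNumElements == 0 &&
         "wide value is not a whole multiple of the scalar type");
  // e.g. a <16 x float> built out of <4 x float> "scalars" has VF = 4.
  return WideNumElements / ScalarNumElements;
}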
  /// Checks if the mask is an identity mask (possibly per VF-sized slice).
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    // ...
    if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
          // Each VF-sized slice is either all-poison or an identity.
          // ...
        }))
      return true;
    // ...
  }

  /// Combines two shuffle masks into the single mask that has the same effect
  /// when applied to the common source.
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(NewMask);
  }
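// Illustrative sketch (not part of the original source): the mask composition
// performed by combineMasks above. Applying mask M1 and then M2 is the same
// as applying the single composed mask C with C[i] = M1[M2[i]]; -1 stands in
// for PoisonMaskElem. Standalone, standard library only.
#include <cstddef>
#include <vector>

inline std::vector<int> composeMasks(const std::vector<int> &First,
                                     const std::vector<int> &Second) {
  std::vector<int> Composed(Second.size(), -1);
  for (std::size_t I = 0; I < Second.size(); ++I) {
    if (Second[I] < 0)
      continue; // A poison lane stays poison.
    Composed[I] = First[Second[I] % First.size()];
  }
  return Composed;
}

// Example: First = {3, 2, 1, 0} (reverse) followed by Second = {1, 0, 3, 2}
// (swap pairs) composes to {2, 3, 0, 1}, so two shufflevectors collapse into
// one.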
  /// Looks through shuffles, trying to reduce the final number of shuffles in
  /// the code by combining masks.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // Exit if this is not a fixed vector type or a size-changing shuffle.
      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
      // ...
      // Remember the identity candidate, if it is not a resizing shuffle.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             /* ... */ IdentityMask.size())) {
          IdentityOp = SV;
          // Store the current mask so this info is not lost if IdentityOp is
          // selected as the best candidate for the permutation.
          IdentityMask.assign(Mask);
        }
      }
      // Remember the broadcast mask.
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      // ...
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      // ...
      if (!IsOp1Undef && !IsOp2Undef) {
        // Update the mask and mark undef elements.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
        /* ... */) {
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        // ...
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               /*IsStrict=*/false) ||
                (Shuffle &&
                 Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 /* ... */));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }
  /// Smart shuffle instruction emission: walks through chains of shuffles and
  /// tries to find the best matching vectors for the actual shuffle
  /// instruction.
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder) {
    assert(V1 && "Expected at least one vector value.");
    if (V2)
      Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    if (V2 /* && V2 is not an undef operand ... */) {
      // Peek through shuffles.
      Value *Op1 = V1;
      Value *Op2 = V2;
      int VF = cast<VectorType>(V1->getType())
                   ->getElementCount()
                   .getKnownMinValue();
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1;
      Value *PrevOp2;
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
        // Check if we have 2 resizing shuffles - need to peek through the
        // operands again.
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
            // ...
            ExtMask1[Idx] = SV1->getMaskValue(I);
            // ...
            SmallBitVector UseMask1 = buildUseMask(
                cast<FixedVectorType>(SV1->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask1, UseMask::SecondArg);
            // ...
            ExtMask2[Idx] = SV2->getMaskValue(I);
            // ...
            SmallBitVector UseMask2 = buildUseMask(
                cast<FixedVectorType>(SV2->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask2, UseMask::SecondArg);
            if (SV1->getOperand(0)->getType() ==
                    SV2->getOperand(0)->getType() &&
                SV1->getOperand(0)->getType() != SV1->getType() &&
                /* ... second operands are undef per the use masks ... */) {
              Op1 = SV1->getOperand(0);
              Op2 = SV2->getOperand(0);
              SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
              CombinedMask1.swap(ShuffleMask1);
              SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
              CombinedMask2.swap(ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(cast<VectorType>(Op1->getType())
                        ->getElementCount()
                        .getKnownMinValue(),
                    cast<VectorType>(Op2->getType())
                        ->getElementCount()
                        .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      if (Op1 == Op2 &&
          (/* identity mask, or splat of the same shuffle: */
           isa<ShuffleVectorInst>(Op1) &&
           cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
               ArrayRef(CombinedMask1)))
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    SmallVector<int> NewMask(Mask);
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");

    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // The pointers are operands of loads/stores vectorized as plain wide
    // memory operations.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);
    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity, assume Ptr stays in vectorized code if it is not a
      // GEP or has more than one use.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }

    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code, there is no chance to
      // vectorize them away.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // The pointers form a regular vectorization tree node (scattered loads).
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    // ...
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      // ...
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }

  return std::make_pair(ScalarCost, VecCost);
}
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to reorder masked gather loads, just reorder the scalar
      // operands.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as
      // strided load with stride -1.
      if (/* reversed order and strided loads are legal ... */) {
        // ...
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        // ...
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost)
          // Strided load is more profitable than consecutive load + reverse -
          // transform the node to strided load.
          E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // Check if profitable to represent consecutive store + reverse as
      // strided store with stride -1.
      if (/* reversed order and strided stores are legal ... */) {
        // ...
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        // ...
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // Strided store is more profitable than reverse + consecutive
          // store - transform the node to strided store.
          E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      // ...
      // This node is a minmax node.
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
      if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }
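// Illustrative sketch (not part of the original source): the decision made in
// the Load/Store cases of transformNodes above. A consecutive access plus a
// reverse shuffle competes against a single strided access with stride -1,
// and the cheaper variant wins. The cost inputs are stand-ins for TTI
// numbers, not real target data.
enum class AccessShape { ConsecutivePlusReverse, StridedMinusOne };

inline AccessShape pickAccessShape(unsigned WideAccessCost,
                                   unsigned ReverseShuffleCost,
                                   unsigned StridedAccessCost) {
  unsigned Original = WideAccessCost + ReverseShuffleCost;
  return StridedAccessCost < Original ? AccessShape::StridedMinusOne
                                      : AccessShape::ConsecutivePlusReverse;
}
// e.g. pickAccessShape(1, 1, 1) selects the strided form: one instruction
// instead of a load-plus-shuffle pair.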
  /// Set to true once the final shuffle has been emitted.
  bool IsFinalized = false;
  // ...
  /// While true, the costs of matching nodes are still being accumulated into
  /// a common mask instead of being estimated immediately.
  bool SameNodesEstimated = true;

  // (fragment)
  if (auto *VTy = dyn_cast<VectorType>(Ty))
    // ...

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    // ...
    // Improve gather cost for gather of loads, if we can group some of the
    // loads into vector loads.
    InstructionsState S = getSameOpcode(VL, *R.TLI);
    const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
    unsigned MinVF = R.getMinVF(2 * Sz);
    if (VL.size() > 2 &&
        ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
         (InVectors.empty() &&
          any_of(seq<unsigned>(0, VL.size() / MinVF), [&](unsigned Idx) {
            ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
            InstructionsState S = getSameOpcode(SubVL, *R.TLI);
            return S.getOpcode() == Instruction::Load && !S.isAltShuffle();
          }))) &&
        !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
        !isSplat(Gathers)) {
      // ...
      unsigned StartIdx = 0;
      unsigned VF = VL.size() / 2;
      for (; VF >= MinVF; VF /= 2) {
        for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
             Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
            InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
            if (SliceS.getOpcode() != Instruction::Load ||
                SliceS.isAltShuffle())
              continue;
          }
          // ...
          LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
                                              CurrentOrder, PointerOps);
          // Mark the vectorized loads so that we do not vectorize them again.
          if (((LS == LoadsState::Vectorize ||
                LS == LoadsState::StridedVectorize) &&
               CurrentOrder.empty()) ||
              /* ... */)
            VectorizedStarts.emplace_back(Cnt, LS);
          else
            ScatterVectorized.push_back(Cnt);
          // ...
          // If we vectorized the initial block, no need to try it again.
          if (Cnt == StartIdx)
            StartIdx += VF;
          // ...
        }
        // Check if the whole array was vectorized already - exit.
        if (StartIdx >= VL.size())
          break;
        // Found vectorizable parts - exit.
        if (!VectorizedLoads.empty())
          break;
      }
      if (!VectorizedLoads.empty()) {
        bool NeedInsertSubvectorAnalysis =
            !NumParts || (VL.size() / VF) > NumParts;
        // Get the cost for gathered loads.
        // ...
        GatherCost +=
            getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
        // ...
        // The cost for vectorized loads.
        for (Value *V : VectorizedLoads) {
          auto *LI = cast<LoadInst>(V);
          // ...
        }
        for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
          auto *LI = cast<LoadInst>(VL[P.first]);
          // ...
          /*VariableMask=*/false, Alignment, CostKind, LI);
          // Add external uses costs.
          for (auto [Idx, V] : enumerate(VL.slice(
                   P.first, std::min<unsigned>(VL.size() - P.first, VF))))
            if (!R.areAllUsersVectorized(cast<Instruction>(V)))
              GatherCost += TTI.getVectorInstrCost(
                  Instruction::ExtractElement, LoadTy, CostKind, Idx);
          // Estimate GEP cost.
          // ...
          PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
          auto [ScalarGEPCost, VectorGEPCost] =
              getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
                          Instruction::Load, CostKind, LI->getType(), LoadTy);
          GatherCost += VectorGEPCost - ScalarGEPCost;
        }
        for (unsigned P : ScatterVectorized) {
          auto *LI0 = cast<LoadInst>(VL[P]);
          // ...
          Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
          GatherCost += TTI.getGatherScatterOpCost(
              Instruction::Load, LoadTy, LI0->getPointerOperand(),
              /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
          // ...
          PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
          if (/* pointers can be sorted ... */) {
            auto [ScalarGEPCost, VectorGEPCost] =
                getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
                            CostKind, ScalarTy, VecTy);
            GatherCost += VectorGEPCost - ScalarGEPCost;
            if (!Order.empty()) {
              // ...
              GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                                             VecTy, Mask, CostKind);
            }
          } else {
            GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true,
                                          PointerOps.front()->getType());
          }
        }
        if (NeedInsertSubvectorAnalysis) {
          // Add the cost for the subvector inserts.
          for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
            for (unsigned Idx : seq<unsigned>(0, E))
              // ...
            GatherCost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector,
                                           VecTy, ShuffleMask, CostKind, I,
                                           LoadTy);
          }
        }
        GatherCost -= ScalarsCost;
      }
      GatherCost = std::min(BaseCost, GatherCost);
    } else if (!Root && isSplat(VL)) {
      // Found a broadcast of a single scalar; calculate the cost as the
      // broadcast.
      const auto *It = find_if_not(VL, IsaPred<UndefValue>);
      assert(It != VL.end() && "Expected at least one non-undef value.");
      // Add a broadcast for a non-identity shuffle only.
      bool NeedShuffle =
          count(VL, *It) > 1 &&
          (isa<UndefValue>(VL.front()) || !isConstant(VL.front()));
      if (!NeedShuffle)
        return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                      CostKind, std::distance(VL.begin(), It),
                                      PoisonValue::get(VecTy), *It);
      // ...
      transform(VL, ShuffleMask.begin(), [](Value *V) {
        return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
      });
      // ...
      return /* insert + */ ::getShuffleCost(TTI, TTI::SK_Broadcast, VecTy,
                                             ShuffleMask, CostKind);
    }
    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)
                ? TTI::TCC_Free
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                  ScalarTy));
  }
  /// Computes the cost of building a vector out of the extracted values in
  /// \p VL via one or two register-level shuffles.
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    // ...
    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
                                        SmallVectorImpl<unsigned> &Indices)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      // Check whether we are permuting one or two input registers.
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      for (auto [Pos, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        // ...
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          // ...
          if (Indices.size() == 1) {
            OffsetReg1 = alignDown(
                std::accumulate(
                    std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                    [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(S, I);
                    }),
                EltsPerVector);
            Indices.push_back(OffsetReg1 % NumElts);
          }
          Idx = I - OffsetReg1;
        }
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    // ...
    // Process extracts in blocks of EltsPerVector to check whether the source
    // vector register can be reused directly.
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector,
          getNumElems(Mask.size(), EltsPerVector, Part));
      // ...
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices);
      if (!RegShuffleKind) {
        if (/* not an identity over */ std::max<unsigned>(NumElts,
                                                          MaskSlice.size()))
          // Cost the whole-vector shuffle.
          // ...
        continue;
      }
      // ...
      for (unsigned Idx : Indices) {
        assert((Idx + EltsPerVector) <= alignTo(NumElts, EltsPerVector) &&
               "SK_ExtractSubvector index out of range");
        Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
                                 /* ... */ std::nullopt, CostKind, Idx,
                                 getWidenedType(ScalarTy, EltsPerVector));
      }
      // Second check: a plain permute may be estimated cheaper than the
      // subvector extracts.
      // ...
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }

  /// Transforms mask \p CommonMask per given \p Mask to make a proper set
  /// after shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx;
  }
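// Illustrative sketch (not part of the original source): a simplified version
// of the per-register analysis in CheckPerRegistersShuffle above. A wide
// gather mask is cut into register-sized slices, and each slice is rebased so
// its indices address at most two physical registers; a slice touching more
// than two registers cannot be one hardware shuffle. Standalone, standard
// library only; the real code additionally rebases against aligned offsets.
#include <optional>
#include <set>
#include <vector>

// Returns the rebased slice (indices in [0, 2*EltsPerVector)) or std::nullopt
// if the slice pulls lanes from more than two registers.
inline std::optional<std::vector<int>>
rebaseSliceToRegisters(std::vector<int> Slice, unsigned EltsPerVector) {
  std::set<int> Regs;
  int FirstReg = -1;
  for (int &I : Slice) {
    if (I < 0)
      continue; // Poison lane.
    int Reg = I / static_cast<int>(EltsPerVector);
    if (FirstReg < 0)
      FirstReg = Reg;
    Regs.insert(Reg);
    if (Regs.size() > 2)
      return std::nullopt;
    I = I % static_cast<int>(EltsPerVector) +
        (Reg == FirstReg ? 0 : static_cast<int>(EltsPerVector));
  }
  return Slice;
}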
  /// Estimates the permutation cost for a pair of nodes, delaying the
  /// estimation while the same nodes keep being reshuffled.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // If we already requested the cost of reshuffling E1 and E2, do not
      // estimate another cost with the sub-mask; instead merge the sub-mask
      // into CommonMask to estimate it later and avoid double counting.
      if ((InVectors.size() == 2 &&
           InVectors.front().get<const TreeEntry *>() == &E1 &&
           InVectors.back().get<const TreeEntry *>() == E2) ||
          (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        assert(/* the slice of CommonMask is all-poison && */
               "Expected all poisoned elements.");
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the matched
      // nodes and transform the mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF,
                      cast<FixedVectorType>(V1->getType())->getNumElements());
      } else {
        const auto *E = InVectors.front().get<const TreeEntry *>();
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      Cost += createShuffle(&E1, E2, Mask);
      transformMaskAfterShuffle(CommonMask, Mask);
    }
  }
  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      // ...
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
             /* ... extract of the leading subvector ... */;
    }

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // An empty mask or an identity mask is free.
      unsigned VF = cast<VectorType>(V1->getType())
                        ->getElementCount()
                        .getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // An empty mask or an identity mask is free.
      unsigned VF = cast<VectorType>(V1->getType())
                        ->getElementCount()
                        .getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
    InstructionCost createPoison(Type *, unsigned) const {
      return TTI::TCC_Free;
    }
    void resizeToMatch(Value *&, Value *&) const {}
  };
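// Illustrative sketch (not part of the original source): the "free shuffle"
// test used by ShuffleCostBuilder above. An empty mask, a full identity mask,
// or an extract of the leading subvector all cost nothing because no data
// movement is required. Standalone, standard library only.
#include <cstddef>
#include <vector>

inline bool isFreeShuffleMask(const std::vector<int> &Mask, unsigned VF) {
  if (Mask.empty())
    return true;
  // Defined lanes must read their own position; poison lanes (-1) are
  // wildcards.
  for (std::size_t I = 0; I < Mask.size(); ++I)
    if (Mask[I] >= 0 && Mask[I] != static_cast<int>(I))
      return false;
  // Identity over the whole register, or a leading-prefix extract.
  return Mask.size() <= VF;
}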
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask.begin(), Mask.end());
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      // ...
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        // ...
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    getWidenedType(EScalarTy, VF),
                                    TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      if (isa<Constant>(V))
        return TTI::TCC_Free;
      auto *VecTy = cast<VectorType>(V->getType());
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        // ...
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
            VecTy, TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = P1.get<const TreeEntry *>();
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = P2.get<const TreeEntry *>();
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        // ...
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
            Idx = EMask[Idx];
          else if (Idx >= static_cast<int>(CommonVF))
            Idx = (E2Mask.empty() ? Idx - CommonVF
                                  : E2Mask[Idx - CommonVF]) +
                  E->Scalars.size();
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      // ...
    } else if (!V1 && P2.isNull()) {
      // Shuffle a single entry node.
      const TreeEntry *E = P1.get<const TreeEntry *>();
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      // Not an identity/broadcast? Try to see if the original vector is
      // better.
      if (!E->ReorderIndices.empty() &&
          CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 }) &&
          /* ... */) {
        // ...
      }
    } else if (V1 && P2.isNull()) {
      // Shuffle a single vector.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle a vector and a tree node.
      unsigned VF = getVF(V1);
      const TreeEntry *E2 = P2.get<const TreeEntry *>();
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      // ...
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      // ...
    } else if (!V1 && V2) {
      // Shuffle a tree node and a vector.
      unsigned VF = getVF(V2);
      const TreeEntry *E1 = P1.get<const TreeEntry *>();
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          // ...
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      // ...
      ExtraCost += GetValueMinBWAffectedCost(V2);
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      if (V1->getType() != V2->getType()) {
        // ...
      } else {
        if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
          // ...
        if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
          // ...
      }
    }
    if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
      assert(SLPReVec && "FixedVectorType is not expected.");
      // ...
    }
    InVectors.front() =
        Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder);
  }
public:
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    // ...
    Value *VecBase = nullptr;
    ArrayRef<Value *> VL = E->Scalars;
    // If the resulting type is scalarized, do not adjust the cost.
    if (NumParts == VL.size())
      return nullptr;
    // Check if the extracts can be considered reused if the same
    // extractelements were vectorized already.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((!TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
        // Ignore non-extractelement scalars.
        if (isa<UndefValue>(V) ||
            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
          continue;
        // If all users of the instruction are going to be vectorized and this
        // instruction itself is not, consider this extractelement as removed
        // and take credit for it.
        auto *EE = cast<ExtractElementInst>(V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        const TreeEntry *VE = R.getTreeEntry(V);
        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
            any_of(EE->users(),
                   [&](User *U) {
                     return isa<GetElementPtrInst>(U) &&
                            !R.areAllUsersVectorized(cast<Instruction>(U),
                                                     &VectorizedVals);
                   }) ||
            (VE && VE != E))
          continue;
        // ...
        unsigned Idx = *EEIdx;
        // Take credit for the instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          Instruction *Ext = EE->user_back();
          if (isa<SExtInst, ZExtInst>(Ext) &&
              all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() for the extract/ext pair.
            Cost -= TTI.getExtractWithExtendCost(
                Ext->getOpcode(), Ext->getType(),
                EE->getVectorOperandType(), Idx);
            // Add back the cost of the s|zext, which is subtracted
            // separately.
            Cost += TTI.getCastInstrCost(
                Ext->getOpcode(), Ext->getType(), EE->getType(),
                TTI::getCastContextHint(Ext), CostKind, Ext);
            continue;
          }
        }
        // ...
      }
    }
    // Check if the gather of extractelements can be represented as just a
    // shuffle of one or two input vectors.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    // ...
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      // ...
    }
    return VecBase;
  }
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (&E1 == &E2) {
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    if (NumParts == 0 || NumParts >= Mask.size())
      NumParts = 1;
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // ...
    if (NumParts == 0 || NumParts >= Mask.size())
      NumParts = 1;
    // ...
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    // May come only for shuffling of 2 vectors with extractelements, already
    // handled in adjustExtracts.
    assert(InVectors.size() == 1 &&
           all_of(enumerate(CommonMask),
                  [&](auto P) {
                    // ...
                    auto *EI = cast<ExtractElementInst>(
                        InVectors.front()
                            .get<const TreeEntry *>()
                            ->Scalars[P.index()]);
                    return EI->getVectorOperand() == V1 ||
                           EI->getVectorOperand() == V2;
                  }) &&
           "Expected extractelement vectors.");
  }
  /// Adds another input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled in adjustExtracts.
      assert(InVectors.size() == 1 &&
             InVectors.front().is<const TreeEntry *>() &&
             !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar = InVectors.front()
                                          .get<const TreeEntry *>()
                                          ->Scalars[P.index()];
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      if (isa<Constant>(V1))
                        return true;
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(VF, cast<FixedVectorType>(
                            InVectors.front().get<Value *>()->getType())
                            ->getNumElements());
    }
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // FIXME: Need to find a way to avoid use of getNullValue here.
      SmallVector<Constant *> Vals;
      unsigned VF = VL.size();
      if (MaskVF != 0)
        VF = std::min(VF, MaskVF);
      for (Value *V : VL.take_front(VF)) {
        if (isa<UndefValue>(V)) {
          Vals.push_back(cast<Constant>(V));
          continue;
        }
        Vals.push_back(Constant::getNullValue(V->getType()));
      }
      if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        // When REVEC is enabled, vector types must be expanded into their
        // scalar elements.
        unsigned VecTyNumElements = VecTy->getNumElements();
        SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
        for (auto [I, V] : enumerate(Vals)) {
          Type *ScalarTy = V->getType()->getScalarType();
          Constant *NewVal;
          if (isa<PoisonValue>(V))
            NewVal = PoisonValue::get(ScalarTy);
          else if (isa<UndefValue>(V))
            NewVal = UndefValue::get(ScalarTy);
          else
            NewVal = Constant::getNullValue(ScalarTy);
          std::fill_n(NewVals.begin() + I * VecTyNumElements,
                      VecTyNumElements, NewVal);
        }
        Vals.swap(NewVals);
      }
      return ConstantVector::get(Vals);
    }
    return ConstantVector::getSplat(
        ElementCount::getFixed(
            cast<FixedVectorType>(Root->getType())->getNumElements()),
        /* ... */);
  }
  InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize the emission of the shuffles.
  InstructionCost
  finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = Vec.get<Value *>();
      Action(V, CommonMask);
      InVectors.front() = V;
    }
    // ...
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }

  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  Value *Op = E->getOperand(Idx).front();
  if (const TreeEntry *TE = getTreeEntry(Op)) {
    if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
          return EI.EdgeIdx == Idx && EI.UserTE == E;
        }) != TE->UserTreeIndices.end())
      return TE;
    auto MIt = MultiNodeScalars.find(Op);
    if (MIt != MultiNodeScalars.end()) {
      for (const TreeEntry *TE : MIt->second) {
        if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
              return EI.EdgeIdx == Idx && EI.UserTE == E;
            }) != TE->UserTreeIndices.end())
          return TE;
      }
    }
  }
  const auto *It =
      find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() &&
               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                 return EI.EdgeIdx == Idx && EI.UserTE == E;
               }) != TE->UserTreeIndices.end();
      });
  assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
  return It->get();
}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    // ...
  }
  return TTI::CastContextHint::None;
}
  Type *ScalarTy = VL[0]->getType();
  if (!E->isGather()) {
    if (auto *SI = dyn_cast<StoreInst>(VL[0]))
      ScalarTy = SI->getValueOperand()->getType();
    else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
      ScalarTy = CI->getOperand(0)->getType();
    else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
      ScalarTy = IE->getOperand(1)->getType();
  }
  // ...
  // If we have computed a smaller type for the expression, update VecTy so
  // that the costs are accurate.
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
    auto VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    if (VecTy)
      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
  }
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  unsigned EntryVF = E->getVectorFactor();
  auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);

  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
  if (E->isGather()) {
    // ...
    if (isa<InsertElementInst>(VL[0]))
      return InstructionCost::getInvalid();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  InstructionCost CommonCost = 0;
  SmallVector<int> Mask;
  bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
  if (!E->ReorderIndices.empty() &&
      (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
    } else {
      inversePermutation(E->ReorderIndices, NewMask);
    }
    ::addMask(Mask, NewMask);
  }
  if (NeedToShuffleReuses)
    ::addMask(Mask, E->ReuseShuffleIndices);
  // ...
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize) &&
         "Unhandled state");
  assert(E->getOpcode() &&
         ((allSameType(VL) && allSameBlock(VL)) ||
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy())) &&
         "Invalid VL");
  Instruction *VL0 = E->getMainOp();
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector
                        : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  SetVector<Value *> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (getTreeEntry(UniqueValues[I]) == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (const TreeEntry *OpTE = getTreeEntry(V))
      return getCastContextHint(*OpTE);
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
  auto GetCostDiff =
      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
          function_ref<InstructionCost(InstructionCost)> VectorCost) {
        // Calculate the cost of this instruction.
        InstructionCost ScalarCost = 0;
        if (isa<CastInst, CallInst>(VL0)) {
          // For some instructions there is no need to cost each scalar
          // individually; use the cost of one scalar times the number of
          // scalar instructions.
          ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
        } else {
          for (unsigned I = 0; I < Sz; ++I) {
            if (UsedScalars.test(I))
              continue;
            ScalarCost += ScalarEltCost(I);
          }
        }

        InstructionCost VecCost = VectorCost(CommonCost);
        // Check if the current node must be resized when the parent node is
        // not resized.
        if (/* not a cast and not the root ... */) {
          const EdgeInfo &EI = E->UserTreeIndices.front();
          if ((EI.UserTE->getOpcode() != Instruction::Select ||
               EI.EdgeIdx != 0) &&
              It != MinBWs.end()) {
            auto UserBWIt = MinBWs.find(EI.UserTE);
            Type *UserScalarTy =
                EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
            if (UserBWIt != MinBWs.end())
              UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                              UserBWIt->second.first);
            if (ScalarTy != UserScalarTy) {
              unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
              unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
              unsigned VecOpcode;
              if (BWSz > SrcBWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              // ...
            }
          }
        }
        LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
                                 ScalarCost, "Calculated costs for Tree"));
        return VecCost - ScalarCost;
      };
  // Calculate the cost difference from vectorizing a set of GEPs.
  // A negative value means vectorizing is profitable.
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize) &&
           "Entry state expected to be Vectorize or StridedVectorize here.");
    InstructionCost ScalarCost = 0;
    InstructionCost VecCost = 0;
    std::tie(ScalarCost, VecCost) = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
  };

  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    // ...
    Type *CanonicalType = Ty;
    // ...
    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    InstructionCost IntrinsicCost =
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    // If the selects are the only uses of the compares, they will be dead,
    // so we can subtract their cost.
    if (VI && SelectOnly) {
      // ...
      auto *CI = cast<CmpInst>(VI->getOperand(0));
      IntrinsicCost -= TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, CI);
    }
    return IntrinsicCost;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Count reused scalars.
    InstructionCost ScalarCost = 0;
    // ...
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      // ...
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
        // ...
      }
      if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
        // ...
        if (!OpTE->ReuseShuffleIndices.empty())
          ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                          OpTE->Scalars.size());
    }

    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *I = cast<Instruction>(UniqueValues[Idx]);
      VectorType *SrcVecTy;
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        else
          NumElts = AggregateTy->getStructNumElements();
        SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
      }
      if (I->hasOneUse()) {
        Instruction *Ext = I->user_back();
        if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
            all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
          // Use getExtractWithExtendCost() to cost the extract/ext pair.
          // ...
          // Subtract the cost of the s|zext, which is costed separately.
          Cost -= TTI->getCastInstrCost(
              Ext->getOpcode(), Ext->getType(), I->getType(),
              TTI::getCastContextHint(Ext), CostKind, Ext);
          return Cost;
        }
      }
      // ...
    };
    auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();
    // ...
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(VL.drop_front())) {
      // ...
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    // ...
    VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz -
                      OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) *
            VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // Check if we can safely insert a subvector. If it is not possible, just
    // generate a whole-sized vector and shuffle the source vector to it.
    if (OffsetBeg + InsertVecSz > VecSz) {
      // ...
      InsertVecSz = VecSz;
    }
    // ...
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
    } else {
      Mask.assign(VecSz, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");
    // ...
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars,
                           cast<Instruction>(V)->getOperand(0));
    }));
    // ...
    SmallBitVector InMask = isUndefVector(
        FirstInsert->getOperand(0),
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        // ...
      } else {
        // ...
        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
          // ...
      }
    }
    return Cost;
  }
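// Illustrative sketch (not part of the original source): the offset analysis
// in the InsertElement case above. The insert indices define a window
// [Beg, End] in the destination vector; if that window covers a whole aligned
// subvector, the buildvector can be costed as a single insert-subvector
// instead of per-lane inserts. Standalone, standard library only.
#include <algorithm>
#include <vector>

struct InsertWindow {
  unsigned Beg, End;
  bool WholeAlignedSubvector;
};

inline InsertWindow analyzeInsertWindow(const std::vector<unsigned> &InsertIdx,
                                        unsigned SubVecSize) {
  unsigned Beg = InsertIdx.front(), End = InsertIdx.front();
  for (unsigned Idx : InsertIdx) {
    Beg = std::min(Beg, Idx);
    End = std::max(End, Idx);
  }
  unsigned AlignedBase = (Beg / SubVecSize) * SubVecSize;
  bool Whole = Beg == AlignedBase && (End + 1) % SubVecSize == 0;
  return {Beg, End, Whole};
}
// e.g. inserts at {4, 5, 6, 7} with SubVecSize = 4 form a whole aligned
// subvector; inserts at {3, 4, 5, 6} do not.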
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    Type *SrcScalarTy = VL0->getOperand(0)->getType();
    auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        // ...
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   VL0->getOperand(0)->getType(),
                                   TTI::getCastContextHint(VI), CostKind, VI);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count the cost if the operation is performed as part of
      // another one.
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
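// Illustrative sketch (not part of the original source): the opcode-rewrite
// rule used above when minimum-bitwidth analysis shrinks one side of a cast.
// If the narrowed source and destination end up the same width, the cast
// degenerates to a bitcast (effectively free); if the destination is
// narrower, it becomes a trunc; otherwise it stays an extension whose
// signedness comes from the analysis. Standalone.
enum class CastKind { BitCast, Trunc, SExt, ZExt };

inline CastKind pickVectorCast(unsigned DstBits, unsigned SrcBits,
                               bool SrcIsSigned) {
  if (DstBits == SrcBits)
    return CastKind::BitCast;
  if (DstBits < SrcBits)
    return CastKind::Trunc;
  return SrcIsSigned ? CastKind::SExt : CastKind::ZExt;
}
// e.g. a zext i8 -> i32 whose whole expression was demoted to 16 bits becomes
// pickVectorCast(16, 8, false) == CastKind::ZExt on 16-bit lanes.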
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpInst::Predicate VecPred, SwappedVecPred;
    auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
    if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
        match(VL0, MatchCmp))
      SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
    // ...
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
                                           ? CmpInst::BAD_FCMP_PREDICATE
                                           : CmpInst::BAD_ICMP_PREDICATE;
      // ...
      if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
           !match(VI, MatchCmp)) ||
          (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
        // ...
      InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, VI);
      InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
      if (IntrinsicCost.isValid())
        ScalarCost = IntrinsicCost;

      return ScalarCost;
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
      InstructionCost VecCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        auto *CondType =
            getWidenedType(SI->getCondition()->getType(), VL.size());
        unsigned CondNumElements = CondType->getNumElements();
        unsigned VecTyNumElements = getNumElements(VecTy);
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // When the return type is an i1 vector and the condition is another
          // vector, an extra shuffle of the condition is needed.
          // ...
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = GetMinMaxCost(VecTy);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
      TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
      TTI::OperandValueInfo Op2Info =
          TTI::getOperandInfo(VI->getOperand(OpIdx));
      return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
                                         Op1Info, Op2Info, /* ... */ VI);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          ArrayRef<Value *> Ops = E->getOperand(I);
          if (all_of(Ops, [&](Value *Op) {
                auto *CI = dyn_cast<ConstantInt>(Op);
                return CI &&
                       CI->getValue().countr_one() >= It->second.first;
              }))
            return CommonCost;
        }
      }
      unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
      TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
      TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind,
                                         Op1Info, Op2Info, std::nullopt,
                                         nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(),
                                  VI->getPointerAddressSpace(), CostKind,
                                  TTI::OperandValueInfo(), VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      if (E->State == TreeEntry::Vectorize) {
        VecLdCost = TTI->getMemoryOpCost(
            Instruction::Load, VecTy, LI0->getAlign(),
            LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
      } else if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::ScatterVectorize &&
               "Unknown EntryState");
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      }
      return VecLdCost + CommonCost;
    };

    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // If this node generates masked gather load then it is not a terminal
    // node. Hence address operand cost is estimated separately.
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;

    // Estimate the cost of the GEPs since this tree node is a terminator.
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL))
      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(),
                                  VI->getPointerAddressSpace(), CostKind,
                                  OpInfo, VI);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // We know that we can merge the stores. Calculate the cost.
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
        VecStCost = TTI->getMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getAlign(),
            BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
      }
      return VecStCost + CommonCost;
    };
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL)) {
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    }

    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      SmallVector<Type *> ArgTys =
          buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
                                 It != MinBWs.end() ? It->second.first : 0);
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::ShuffleVector: {
    if (!SLPReVec || E->isAltShuffle())
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");
    // Try to find a previous shuffle node with the same operands and the
    // same main/alternate ops.
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE.get() == E)
          break;
        if (TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
      // ...
    };
    auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
      InstructionCost VecCost = 0;
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          E->dump();
        });
        // No need to add new vector costs here: we reuse the same
        // main/alternate vector ops, just with different shuffling.
      } else if (Instruction::isBinaryOp(E->getOpcode())) {
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
        VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
                                            CI0->getPredicate(), CostKind,
                                            VL0);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            E->getAltOp());
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
        if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
          auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              E->getMainOp()->getOperand(0)->getType());
          if (SrcIt != MinBWs.end()) {
            SrcBWSz = SrcIt->second.first;
            SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
            SrcTy = getWidenedType(SrcSclTy, VL.size());
          }
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              VecCost = TTIRef.getCastInstrCost(
                  Instruction::Trunc, VecTy, SrcTy,
                  TTI::CastContextHint::None, CostKind);
            LLVM_DEBUG({
              dbgs()
                  << "SLP: alternate extension, which should be truncated.\n";
              E->dump();
            });
            return VecCost;
          }
        }
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          TTI::CastContextHint::None,
                                          CostKind);
        VecCost += TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                           TTI::CastContextHint::None,
                                           CostKind);
      }
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) &&
                   "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask);
      // ...
      // Patterns like [fadd,fsub] can be combined into a single instruction
      // on x86. Reordering them into [fsub,fadd] blocks this pattern, so the
      // order matters here.
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(
          getAltInstrMask(E->Scalars, Opcode0, Opcode1));
      // If this pattern is supported by the target, consider it.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      // TODO: Check the reverse order too.
      return VecCost;
    };
    if (SLPReVec && !E->isAltShuffle())
      return GetCostDiff(GetScalarCost,
                         [&](InstructionCost) -> InstructionCost {
                           // ...
                         });
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
    return CommonCost;
  default:
    llvm_unreachable("Unknown instruction");
  }
}
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size()
                    << " is fully vectorizable .\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE,
                                         unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
            TE->Scalars.size() < Limit ||
            ((TE->getOpcode() == Instruction::ExtractElement ||
              all_of(TE->Scalars,
                     IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(TE->Scalars, Mask)) ||
            (TE->isGather() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()));
  };

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather node if it matches the first node.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize))
    return false;

  return true;
}
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value. Arbitrarily follow the path
  // through operand 0 of any 'or', and peek through optional
  // shift-left-by-multiple-of-8-bits.
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // Check if the input is an extended load of the required or/shift
  // expression.
  Value *Load;
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;

  // Require that the total load bit width is a legal integer type. For
  // example, <8 x i8> --> i64 is legal on a 64-bit target, but <16 x i8> -->
  // i128 is not, so the backend probably cannot reduce it.
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
    return false;

  // Everything matched - assume that we can fold the whole sequence using
  // load combining.
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");

  return true;
}
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /*MustMatchOrInst=*/false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  // Peek through a final sequence of stores and check if all operations are
  // likely to be load-combined.
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    Value *X;
    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
        !isLoadCombineCandidateImpl(X, NumElts, TTI,
                                    /*MustMatchOrInst=*/true))
      return false;
  }
  return true;
}

bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  // ...
  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         allConstant(VectorizableTree[1]->Scalars))))
    return true;

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for vectorization; skip it if the cost threshold is default.
  constexpr int Limit = 4;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                TE->getOpcode() != Instruction::ExtractElement &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <=
                    Limit) ||
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;

  // ...
  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it only if we can prove it is fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;

  // Check if any of the gather nodes form an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 &&
       VectorizableTree.front()->getOpcode() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() !=
           Instruction::GetElementPtr &&
       allSameBlock(VectorizableTree.front()->Scalars));
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, UndefValue>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return true;

  assert(VectorizableTree.empty()
             ? ExternalUses.empty()
             : true && "We shouldn't have any external users");

  // Otherwise, we cannot vectorize the tree. It is both tiny and not fully
  // vectorizable.
  return true;
}
  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
  InstructionCost Cost = 0;

  SmallPtrSet<Instruction *, 4> LiveValues;
  Instruction *PrevInst = nullptr;

  // The entries in VectorizableTree are not necessarily ordered by their
  // position in basic blocks. Collect them and order them by dominance so
  // later instructions are guaranteed to be visited first.
  SmallVector<Instruction *, 16> OrderedScalars;
  for (const auto &TEPtr : VectorizableTree) {
    if (TEPtr->State != TreeEntry::Vectorize)
      continue;
    Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
    if (!Inst)
      continue;
    OrderedScalars.push_back(Inst);
  }
  llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
    auto *NodeA = DT->getNode(A->getParent());
    auto *NodeB = DT->getNode(B->getParent());
    assert(NodeA && "Should only process reachable instructions");
    assert(NodeB && "Should only process reachable instructions");
    assert((NodeA == NodeB) ==
               (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    if (NodeA != NodeB)
      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
    return B->comesBefore(A);
  });

  for (Instruction *Inst : OrderedScalars) {
    if (!PrevInst) {
      PrevInst = Inst;
      continue;
    }

    // Update LiveValues.
    LiveValues.erase(PrevInst);
    for (auto &J : PrevInst->operands()) {
      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
        LiveValues.insert(cast<Instruction>(&*J));
    }

    LLVM_DEBUG({
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
      Inst->dump();
    });

    // Now find the sequence of instructions between PrevInst and Inst.
    unsigned NumCalls = 0;
    BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
                                 PrevInstIt =
                                     PrevInst->getIterator().getReverse();
    while (InstIt != PrevInstIt) {
      if (PrevInstIt == PrevInst->getParent()->rend()) {
        PrevInstIt = Inst->getParent()->rbegin();
        continue;
      }

      auto NoCallIntrinsic = [this](Instruction *I) {
        if (auto *II = dyn_cast<IntrinsicInst>(I)) {
          if (II->isAssumeLikeIntrinsic())
            return true;
          FastMathFlags FMF;
          SmallVector<Type *, 4> Tys;
          for (auto &ArgOp : II->args())
            Tys.push_back(ArgOp->getType());
          if (auto *FPMO = dyn_cast<FPMathOperator>(II))
            FMF = FPMO->getFastMathFlags();
          // Compare the cost of the intrinsic against the cost of a real
          // call; calls that are cheaper as intrinsics do not clobber
          // registers.
          // ...
          if (IntrCost < CallCost)
            return true;
        }
        return false;
      };

      // Debug information does not impact spill cost.
      if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
          &*PrevInstIt != PrevInst)
        NumCalls++;

      ++PrevInstIt;
    }

    if (NumCalls) {
      SmallVector<Type *, 4> V;
      for (auto *II : LiveValues) {
        auto *ScalarTy = II->getType();
        if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
          ScalarTy = VectorTy->getElementType();
        V.push_back(getWidenedType(ScalarTy, BundleWidth));
      }
      Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
    }

    PrevInst = Inst;
  }

  return Cost;
}
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  // Walk the vector operands of the insertelement instructions trying to
  // find either VU as the original vector for IE2 or V as the original
  // vector for IE1.
  do {
    // ...
    if (I1 && (I1 == IE1 || I1->hasOneUse()) && /* ... */)
      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) && /* ... */)
      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
  // ...
/// Returns the incoming Value * if the requested type is Value * too, or a
/// default-constructed value otherwise.
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
/// Analyzes the provided shuffle masks and performs the requested actions on
/// the vectors with the given masks: resizes the base vector if it is not
/// undef, handles the single-mask case separately, and otherwise performs
/// pairwise shuffles while combining the masks between the steps.
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask,
    Value *Base, function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
  if (!IsBaseUndef.all()) {
    // The base is not undef; combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    // ...
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        // ...
      else
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    auto *V = ValueSelect::get<T *>(Base);
    (void)V;
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // The base is undef and only one vector is shuffled; perform the action
    // only for the single vector if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      // Identity mask found.
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // The base is undef and at least two input vectors are shuffled; perform
    // two-vector shuffles step by step, combining the masks between steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // The inputs are of the same size; shuffle them directly.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem) {
          // ...
          Mask[I] = SecMask[I] + Vec1VF;
        }
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first,
                                               Mask, /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        // ...
        if (SecMask[I] != PoisonMaskElem) {
          // ...
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        }
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  bool IsBaseNotUndef = !IsBaseUndef.all();
  (void)IsBaseNotUndef;
  // Perform the requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffle other input vectors, if any.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    ArrayRef<int> SecMask = VMIt->second;
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      } else if (Mask[I] != PoisonMaskElem) {
        Mask[I] = I;
      }
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");

  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();

  SmallPtrSet<Value *, 4> CheckedExtracts;
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // No need to count the cost for combined entries; they are combined and
    // just skipped.
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump();
          dbgs() << "SLP: Current total cost = " << Cost << "\n");
      continue;
    }
    // Exclude the cost of gather nodes that duplicate vectorizable nodes.
    if (TE.isGather()) {
      if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
          E && E->getVectorFactor() == TE.getVectorFactor() &&
          E->isSame(TE.Scalars)) {
        // Some gather nodes might be absolutely the same as some vectorizable
        // nodes after reordering; handle it.
        LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle.\n"
                          << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }

    InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
    Cost += C;
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle.\n"
                      << "SLP: Current total cost = " << Cost << "\n");
  }
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  // ...
  for (ExternalUser &EU : ExternalUses) {
    // We only add the extract cost once for the same scalar.
    if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;

    // Uses by ephemeral values are free: the ephemeral value will be removed
    // prior to code generation, and so will the extraction.
    if (EphValues.count(EU.User))
      continue;

    // No extract cost for a vector "scalar".
    if (isa<FixedVectorType>(EU.Scalar->getType()))
      continue;

    // If the found user is an insertelement, do not calculate the extract
    // cost but try to detect it as a final shuffled/identity match.
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
        VU && VU->getOperand(1) == EU.Scalar) {
      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
        if (!UsedInserts.insert(VU).second)
          continue;
        // ...
        const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
        auto *It = find_if(
            FirstUsers,
            [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
              return areTwoInsertFromSameBuildVector(
                  VU, cast<InsertElementInst>(Pair.first),
                  [this](InsertElementInst *II) -> Value * {
                    Value *Op0 = II->getOperand(0);
                    if (getTreeEntry(II) && !getTreeEntry(Op0))
                      return nullptr;
                    return Op0;
                  });
            });
        int VecId = -1;
        if (It == FirstUsers.end()) {
          // ...
          // Find the insert chain vectorized in the tree, if any.
          while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
            if (IEBase != EU.User &&
                (!IEBase->hasOneUse() || /* ... */))
              break;
            // Build the mask for the vectorized insertelement instructions.
            if (const TreeEntry *E = getTreeEntry(IEBase)) {
              VU = IEBase;
              do {
                IEBase = cast<InsertElementInst>(Base);
                // ...
                assert(Mask[Idx] == PoisonMaskElem &&
                       "InsertElementInstruction used already.");
                // ...
                Base = IEBase->getOperand(0);
              } while (E == getTreeEntry(Base));
              break;
            }
            Base = cast<InsertElementInst>(Base)->getOperand(0);
          }
          FirstUsers.emplace_back(VU, ScalarTE);
          // ...
          VecId = FirstUsers.size() - 1;
          auto It = MinBWs.find(ScalarTE);
          if (It != MinBWs.end() &&
              VectorCasts
                  .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                  .second) {
            unsigned BWSz = It->second.first;
            unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
            unsigned VecOpcode;
            if (DstBWSz < BWSz)
              VecOpcode = Instruction::Trunc;
            else
              VecOpcode =
                  It->second.second ? Instruction::SExt : Instruction::ZExt;
            InstructionCost C = TTI->getCastInstrCost(
                VecOpcode, FTy,
                getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
                               FTy->getNumElements()),
                TTI::CastContextHint::None, CostKind);
            LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                              << " for extending externally used vector with "
                                 "non-equal minimum bitwidth.\n");
            Cost += C;
          }
        } else {
          // ...
          VecId = std::distance(FirstUsers.begin(), It);
        }
        int InIdx = *InsertIdx;
        SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
        if (Mask.empty())
          Mask.assign(FTy->getNumElements(), PoisonMaskElem);
        Mask[InIdx] = EU.Lane;
        DemandedElts[VecId].setBit(InIdx);
        continue;
      }
    }

    // If we plan to rewrite the tree in a smaller type, we will need to
    // extend the extracted value back to the original type; account for the
    // extract plus the extension.
    InstructionCost ExtraCost = TTI::TCC_Free;
    auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
    const TreeEntry *Entry = getTreeEntry(EU.Scalar);
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      unsigned Extend =
          It->second.second ? Instruction::SExt : Instruction::ZExt;
      // ...
    } else {
      ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                          CostKind, EU.Lane);
    }
    // Leave the scalar instructions as is if they are cheaper than extracts.
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for_each(enumerate(ExternalUses), [&](const auto &P) {
          // Ignore phis in loops.
          if (auto *Phi = dyn_cast_if_present<PHINode>(P.value().User)) {
            auto *I = cast<Instruction>(P.value().Scalar);
            const Loop *L = LI->getLoopFor(Phi->getParent());
            if (L && (Phi->getParent() == I->getParent() ||
                      L == LI->getLoopFor(I->getParent())))
              return;
          }
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        });
      }
      // The original instruction can be used if no operands are vectorized,
      // or the operands are marked as externally used already.
      auto *Inst = cast<Instruction>(EU.Scalar);
      bool CanBeUsedAsScalar = all_of(Inst->operands(), [&](Value *V) {
        if (!getTreeEntry(V)) {
          // Some extractelements might not be vectorized, but transformed
          // into a shuffle and removed from the function; consider that here.
          if (auto *EE = dyn_cast<ExtractElementInst>(V))
            return !EE->hasOneUse() || !MustGather.contains(EE);
          return true;
        }
        return ValueToExtUses->contains(V);
      });
      if (CanBeUsedAsScalar) {
        InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
        bool KeepScalar = ScalarCost <= ExtraCost;
        if (KeepScalar && ScalarCost != TTI::TCC_Free &&
            ExtraCost - ScalarCost <= TTI::TCC_Basic) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end())
            ScalarUsesCount -= It->getSecond().size();
          // Keep the original scalar if the number of externally used
          // instructions in the same entry is not a power of 2; it may help
          // to do some extra vectorization.
          KeepScalar =
              ScalarUsesCount <= 1 || !isPowerOf2_32(ScalarUsesCount);
        }
        if (KeepScalar) {
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for_each(Inst->operands(), [&](Value *V) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              // Replace all uses to avoid a compiler crash.
              ExternalUses[It->second].User = nullptr;
            }
          });
          ExtraCost = ScalarCost;
          ExtractsCount[Entry].insert(Inst);
        }
      }
    }

    ExtractCost += ExtraCost;
  }
  // Add the reduced value cost, if resized.
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode =
              BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        // ...
      }
    }
  }

  InstructionCost SpillCost = getSpillCost();
  Cost += SpillCost + ExtractCost;
  // Calculate the cost of the reshuffled vectors, if any.
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    if (VF != VecVF &&
        (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
         !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
      SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
      std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                OrigMask.begin());
      InstructionCost C = ::getShuffleCost(
          *TTI, TTI::SK_PermuteSingleSrc,
          getWidenedType(TE->getMainOp()->getType(), VecVF), OrigMask);
      LLVM_DEBUG(
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement external users.\n";
          TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      Cost += C;
      return std::make_pair(TE, true);
    }
    return std::make_pair(TE, false);
  };
  for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
    Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
    auto Vector = ShuffleMasks[I].takeVector();
    unsigned VF = 0;
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        if (VF == 0)
          VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
            !all_of(enumerate(Mask), [=](const auto &Data) {
              return Data.value() == PoisonMaskElem ||
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          InstructionCost C =
              ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                            << " for final shuffle of insertelement "
                               "external users.\n";
                     TEs.front()->dump();
                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
          Cost += C;
        }
      } else {
        if (VF == 0) {
          if (TEs.front() &&
              TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
            VF = TEs.front()->getVectorFactor();
          else
            VF = Mask.size();
        }
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        InstructionCost C =
            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                          << " for final shuffle of vector node and external "
                             "insertelement users.\n";
                   if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
      }
      VF = Mask.size();
      return TEs.back();
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
        /*Insert=*/true, /*Extract=*/false, CostKind);
    Cost -= InsertCost;
  }
  // Add the cost for the reduced value resize (if required).
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize)
        Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      auto *SrcVecTy =
          getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
      auto *DstVecTy =
          getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
      TTI::CastContextHint CCH = TTI::CastContextHint::None;
      switch (E.getOpcode()) {
      case Instruction::SExt:
      case Instruction::ZExt:
      case Instruction::Trunc: {
        const TreeEntry *OpTE = getOperandEntry(&E, 0);
        CCH = getCastContextHint(*OpTE);
        break;
      }
      default:
        break;
      }
      InstructionCost CastCost =
          TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH, CostKind);
      Cost += CastCost;
      LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
                        << " for final resize for reduction from " << SrcVecTy
                        << " to " << DstVecTy << "\n";
                 dbgs() << "SLP: Current total cost = " << Cost << "\n");
    }
  }
#ifndef NDEBUG
  SmallString<256> Str;
  {
    raw_svector_ostream OS(Str);
    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
       << "SLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif

  return Cost;
}
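// Note on reading the numbers above: the reported total composes roughly as
//   Total Cost = tree cost + spill cost + extract cost - insertelement credit,
// and a negative total means the vectorized form is expected to be cheaper
// than the original scalar code.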
/// Tries to find extractelement instructions with constant indices from a
/// fixed vector type and gathers such instructions into a bunch, which highly
/// likely might be detected as a shuffle of 1 or 2 input vectors. If this
/// attempt was successful, the matched scalars are replaced by poison values
/// in \p VL for future analysis.
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI) {
      if (isa<UndefValue>(VL[I]))
        UndefVectorExtracts.push_back(I);
      continue;
    }
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
      continue;
    std::optional<unsigned> Idx = getExtractIndex(EI);
    // Undefined index.
    if (!Idx) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in extractelements.
  SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
      VectorOpToIdx.takeVector();
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of the vectors or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if it is better to perform a shuffle of 2 vectors or just of a
  // single vector.
  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
  SmallVector<Value *> GatheredExtracts(
      VL.size(), PoisonValue::get(VL.front()->getType()));
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int I : Vectors[Idx].second)
        std::swap(GatheredExtracts[I], VL[I]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as just a
  // shuffle of a single/two vectors the scalars are extracted from.
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask);
  if (!Res) {
    // TODO: try to check other subsets if possible.
    // Restore the original VL if the attempt was not successful.
    copy(SavedVL, VL.begin());
    return std::nullopt;
  }
  // Restore unused scalars from the mask, if some of the extractelements were
  // not selected for shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
        isa<UndefValue>(GatheredExtracts[I])) {
      std::swap(VL[I], GatheredExtracts[I]);
      continue;
    }
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
        is_contained(UndefVectorExtracts, I))
      continue;
  }
  return Res;
}
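// For example, a gather of
//   %e0 = extractelement <4 x float> %src, i32 2
//   %e1 = extractelement <4 x float> %src, i32 3
// is representable as the single-source shuffle
//   shufflevector <4 x float> %src, <4 x float> poison, <2 x i32> <i32 2, i32 3>
// which is why such scalars are replaced by poison in VL and handled through
// the returned shuffle kind instead of being rebuilt element by element.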
/// Splits the gathered scalars into per-register slices and tries to match
/// each slice as a shuffle of extractelements.
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  Mask.assign(VL.size(), PoisonMaskElem);
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // Scan the list of gathered scalars for extractelements that can be
    // represented as shuffles.
    MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
        Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
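// For example, with 8 scalars and NumParts == 2 on a target whose registers
// hold 4 elements, the list is analyzed as two independent 4-element slices,
// and the two sub-masks are copied back into one 8-element mask at offsets 0
// and 4.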
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  Entries.clear();
  // TODO: currently checking only for Scalars in the tree entry, need to count
  // reused elements too for better cost estimation.
  const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  const BasicBlock *TEInsertBlock = nullptr;
  // Main node of PHI entries keeps the correct order of operands/incoming
  // blocks.
  if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    TEInsertPt = TEInsertBlock->getTerminator();
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // Checks whether the vector code for this entry dominates the point where
    // another entry (sharing scalars with it) is going to be emitted, so the
    // already emitted vector value can legally be reused.
    const BasicBlock *InsertBlock = InsertPt->getParent();
    if (!DT->isReachableFromEntry(InsertBlock))
      return false;
    auto *NodeEUI = DT->getNode(InsertBlock);
    if (!NodeEUI)
      return false;
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // Check the order of the gather nodes users.
    if (TEInsertPt->getParent() != InsertBlock &&
        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
        TEInsertPt->comesBefore(InsertPt))
      return false;
    return true;
  };
  // Find all tree entries used by the gathered values. If no common entries
  // are found - this is not a shuffle.
  SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
  DenseMap<Value *, int> UsedValuesEntry;
  for (Value *V : VL) {
    if (isConstant(V))
      continue;
    // Build a list of tree entries where V is used.
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
      if (TEPtr == TE)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndices.size() == 1 &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();

      PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
      const Instruction *InsertPt =
          UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // If 2 gathers are operands of the same entry (regardless of whether
        // the user is a PHI or not), compare operand indices and use the
        // earlier one as the base.
        if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
          continue;
        // If the user instruction is used in several vectorized nodes - make
        // it depend on the index.
        if (TEUseEI.UserTE != UseEI.UserTE &&
            TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
          continue;
      }

      // Check if the user node of TE comes after the user node of TEPtr,
      // otherwise TEPtr depends on TE.
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
        continue;
      VToTEs.insert(TEPtr);
    }
    if (const TreeEntry *VTE = getTreeEntry(V)) {
      if (ForOrder) {
        if (VTE->State != TreeEntry::Vectorize) {
          auto It = MultiNodeScalars.find(V);
          if (It == MultiNodeScalars.end())
            continue;
          VTE = *It->getSecond().begin();
          // Iterate through all vectorized nodes.
          auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
            return MTE->State == TreeEntry::Vectorize;
          });
          if (MIt == It->getSecond().end())
            continue;
          VTE = *MIt;
        }
      }
      Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
      if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
        continue;
      VToTEs.insert(VTE);
    }
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // The first iteration, just insert the list of nodes to vector.
      UsedTEs.push_back(VToTEs);
      UsedValuesEntry.try_emplace(V, 0);
    } else {
      // Need to check if there are any previously used tree nodes which use V.
      // If there are no such nodes, consider that we have another one input
      // vector.
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      unsigned Idx = 0;
      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
        // Do we have a non-empty intersection of previously listed tree
        // entries and tree entries using current V?
        set_intersect(VToTEs, Set);
        if (!VToTEs.empty()) {
          // Yes, write the new subset and continue analysis for the next
          // scalar.
          Set.swap(VToTEs);
          break;
        }
        VToTEs = SavedVToTEs;
        ++Idx;
      }
      // No non-empty intersection found - need to add a second set of possible
      // source vectors.
      if (Idx == UsedTEs.size()) {
        // If the number of input vectors is greater than 2 - not a
        // permutation, fallback to the regular gather.
        // TODO: support multiple reshuffled nodes.
        if (UsedTEs.size() == 2)
          continue;
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
      }
      UsedValuesEntry.try_emplace(V, Idx);
    }
  }

  if (UsedTEs.empty()) {
    Entries.clear();
    return std::nullopt;
  }

  unsigned VF = 0;
  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // Try to find the perfect match in another gather node at first.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        SmallVector<int> CommonMask = TE->getCommonMask();
        copy(CommonMask, Mask.begin());
      }
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[Part * VL.size() + I] = PoisonMaskElem;
      return TargetTransformInfo::SK_PermuteSingleSrc;
    }
    // No perfect match, just shuffle, so choose the first tree node from the
    // tree.
    Entries.push_back(FirstEntries.front());
  } else {
    // Try to find nodes with the same vector factor.
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    // Keep the order of tree nodes to avoid non-determinism.
    DenseMap<int, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      VFToTE.try_emplace(VF, TE);
    }
    // Same, keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        VF = It->first;
        Entries.push_back(It->second);
        Entries.push_back(TE);
        break;
      }
    }
    // No 2 source vectors with the same vector factor - just choose 2 with
    // the max index.
    if (Entries.empty()) {
      Entries.push_back(*llvm::max_element(
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          }));
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    }
  }

  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  // Checks if the 2 PHIs are compatible in terms of high possibility to be
  // vectorized.
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // Check that all incoming values are compatible/from the same parent (if
    // they are instructions). The incoming values are compatible if they all
    // are constants, or instructions with the same/alternate opcodes from the
    // same basic block.
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In = PHI->getIncomingValue(I);
      Value *In1 = PHI1->getIncomingValue(I);
      if (isConstant(In) && isConstant(In1))
        continue;
      if (!getSameOpcode({In, In1}, *TLI).getOpcode())
        return false;
      if (cast<Instruction>(In)->getParent() !=
          cast<Instruction>(In1)->getParent())
        return false;
    }
    return true;
  };
  // Check if the value can be ignored during analysis for shuffled gathers.
  // We suppose it is better to ignore instructions which do not form splats,
  // are not vectorized/not extractelements (these instructions will be
  // handled by extractelements processing) or may form a vector node in the
  // future.
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
           !isVectorLikeInstWithConstOps(I) &&
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // Check that the neighbor instruction may form a full vector node with the
  // current instruction V. It is possible if they have the same/alternate
  // opcode and the same parent basic block.
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           getSameOpcode({V, V1}, *TLI).getOpcode() &&
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  };
  // Build a shuffle mask for better cost estimation and vector emission.
  SmallBitVector UsedIdxs(Entries.size());
  SmallVector<std::pair<unsigned, int>> EntryLanes;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // Do not try to shuffle scalars if they are constants, or instructions
    // that can be vectorized as a result of the following vector build
    // vectorization.
    if (isConstant(V) || (MightBeIgnored(V) &&
                          ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
                           (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
      continue;
    unsigned Idx = It->second;
    EntryLanes.emplace_back(Idx, I);
    UsedIdxs.set(Idx);
  }
  // Iterate through all shuffled scalars and select entries which can be used
  // for the final shuffle.
  SmallVector<const TreeEntry *> TempEntries;
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // Fix the entry number for the given scalar. If it is the first entry,
    // set Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
    // These indices are used when calculating the final shuffle mask as the
    // vector offset.
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    TempEntries.push_back(Entries[I]);
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      !VL.equals(ArrayRef(TE->Scalars)
                     .slice(Part * VL.size(),
                            std::min<int>(VL.size(), TE->Scalars.size())))) {
    // We may have here 1 or 2 entries only. If the number of scalars is equal
    // to the number of entries, no need to do the analysis, it is not very
    // profitable. Since VL is not the same as TE->Scalars, it means we
    // already have some shuffles before. Cut off the not profitable case.
    Entries.clear();
    return std::nullopt;
  }
  // Build the final mask, check for the identity shuffle, if possible.
  bool IsIdentity = Entries.size() == 1;
  // Pair.first is the offset to the vector, while Pair.second is the index of
  // the scalar in the list.
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    Mask[Idx] =
        Pair.first * VF +
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  switch (Entries.size()) {
  case 1:
    if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
      return TargetTransformInfo::SK_PermuteSingleSrc;
    break;
  case 2:
    if (EntryLanes.size() > 2 || VL.size() <= 2)
      return TargetTransformInfo::SK_PermuteTwoSrc;
    break;
  default:
    break;
  }
  Entries.clear();
  // Clear the corresponding mask elements.
  std::fill(std::next(Mask.begin(), Part * VL.size()),
            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
  return std::nullopt;
}
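// For example, if the gathered scalars are {%a, %b, %c, %d} and two already
// vectorized entries produce <%a, %b, ...> and <%c, %d, ...> (both of vector
// factor 4), the gather becomes a single two-source shuffle with mask
// <0, 1, 4, 5>, i.e. SK_PermuteTwoSrc, instead of four insertelements.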
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
    unsigned NumParts, bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get())
    return {};
  // FIXME: Gathering for non-power-of-2 nodes is not implemented yet.
  if (TE->isNonPowOf2Vec())
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert(TE->UserTreeIndices.size() == 1 &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  for (unsigned Part : seq<unsigned>(NumParts)) {
    ArrayRef<Value *> SubVL =
        VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(SubRes);
    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
      return Res;
    }
  }
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  bool DuplicateNonConst = false;
  // Find the cost of inserting/extracting values from the vector.
  // Check if the same elements are inserted several times and count them as
  // shuffle candidates.
  APInt ShuffledElements = APInt::getZero(VL.size());
  DenseMap<Value *, unsigned> UniqueElements;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy) {
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
                                    TTI::CastContextHint::None, CostKind);
      V = nullptr;
    }
    if (!ForPoisonSrc)
      Cost +=
          TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                  I, Constant::getNullValue(VecTy), V);
  };
  SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
  unsigned ScalarTyNumElements = getNumElements(ScalarTy);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    // No need to shuffle duplicates for constants.
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
      ShuffledElements.setBits(I * ScalarTyNumElements,
                               I * ScalarTyNumElements + ScalarTyNumElements);
      ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
      continue;
    }

    auto Res = UniqueElements.try_emplace(V, I);
    if (Res.second) {
      EstimateInsertCost(I, V);
      ShuffleMask[I] = I;
      continue;
    }

    DuplicateNonConst = true;
    ShuffledElements.setBits(I * ScalarTyNumElements,
                             I * ScalarTyNumElements + ScalarTyNumElements);
    ShuffleMask[I] = Res.first->second;
  }
  if (ForPoisonSrc)
    Cost = TTI->getScalarizationOverhead(VecTy, ~ShuffledElements,
                                         /*Insert=*/true,
                                         /*Extract=*/false, CostKind);
  if (DuplicateNonConst)
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                             VecTy, ShuffleMask);
  return Cost;
}
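// For example, gathering {%x, %y, %x, 4.0} as <4 x float> is costed as two
// inserts (%x and %y; the constant lane is folded into the source vector)
// plus one SK_PermuteSingleSrc shuffle with mask <0, 1, 0, 3> that duplicates
// %x into lane 2.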
/// Perform operand reordering on the instructions in VL and return the
/// reordered operands in Left and Right.
static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                           SmallVectorImpl<Value *> &Left,
                                           SmallVectorImpl<Value *> &Right,
                                           const BoUpSLP &R) {
  if (VL.empty())
    return;
  VLOperands Ops(VL, R);
  // Reorder the operands in place.
  Ops.reorder();
  Left = Ops.getVL(0);
  Right = Ops.getVL(1);
}
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto &Res = EntryToLastInstruction.FindAndConstruct(E);
  if (Res.second)
    return *Res.second;
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions with
  // constant indices).
  auto *Front = E->getMainOp();
  auto *BB = Front->getParent();
  assert(all_of(E->Scalars, [=](Value *V) -> bool {
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(V))
      return true;
    auto *I = cast<Instruction>(V);
    return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
           isVectorLikeInstWithConstOps(I);
  }));

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(LastInst) &&
               isVectorLikeInstWithConstOps(I))) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(LastInst->getParent())) {
        LastInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    BB = LastInst->getParent();
    return LastInst;
  };

  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(FirstInst) &&
               isVectorLikeInstWithConstOps(I))) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      if (!DT->isReachableFromEntry(FirstInst->getParent())) {
        FirstInst = I;
        continue;
      }
      if (!DT->isReachableFromEntry(I->getParent()))
        continue;
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  // Set the insert point to the beginning of the basic block if the entry
  // should not be scheduled.
  if (doesNotNeedToSchedule(E->Scalars) ||
      (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
    if ((E->getOpcode() == Instruction::GetElementPtr &&
         any_of(E->Scalars,
                [](Value *V) {
                  return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
                })) ||
        all_of(E->Scalars,
               [](Value *V) {
                 return !isVectorLikeInstWithConstOps(V) &&
                        isUsedOutsideBlock(V);
               }) ||
        (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
           return isa<ExtractElementInst, UndefValue>(V) ||
                  areAllOperandsNonInsts(V);
         })))
      Res.second = FindLastInst();
    else
      Res.second = FindFirstInst();
    return *Res.second;
  }

  // Find the last instruction. The common case should be that BB has been
  // scheduled, and the last instruction is VL.back(). So we start with
  // VL.back() and iterate over schedule data until we reach the end of the
  // bundle. The end of the bundle is marked by null ScheduleData.
  if (BlocksSchedules.count(BB)) {
    Value *V = E->isOneOf(E->Scalars.back());
    if (doesNotNeedToBeScheduled(V))
      V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
    if (Bundle && Bundle->isPartOfBundle())
      for (; Bundle; Bundle = Bundle->NextInBundle)
        Res.second = Bundle->Inst;
  }

  // LastInst can still be null at this point if there is either no entry
  // for BB in BlocksSchedules or there is no ScheduleData available for
  // VL.back(). This can be the case if buildTree_rec aborts for various
  // reasons (e.g., the maximum recursion depth is reached, the maximum region
  // size is reached, etc.).
  if (!Res.second)
    Res.second = FindLastInst();
  assert(Res.second && "Failed to find last instruction in bundle");
  return *Res.second;
}
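// The dominator-tree DFS-in numbers used above give a constant-time dominance
// ordering between reachable blocks: comparing getDFSNumIn() values is enough
// to decide which candidate instruction should be treated as "later" without
// walking the CFG.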
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is a PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle. Set
    // the debug location to Front.
    Builder.SetInsertPoint(
        LastInst->getParent(),
        LastInst->getNextNonDebugInstruction()->getIterator());
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
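// For example, for a bundle {%a = add i32 ..., %b = add i32 ...} in one
// block, the insert point lands right after %b, so the widened add is emitted
// once both scalar operands are materialized; for PHI bundles it lands after
// the PHI section of the block instead.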
Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
  // List of instructions/lanes from the current block and/or the blocks which
  // are part of the current loop. These instructions will be inserted at the
  // end to make it possible to optimize loops and hoist invariant
  // instructions out of the loop body with better chances for success.
  SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           getTreeEntry(Inst) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(Inst, I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      // If the cast source is itself deleted or vectorized, cast from the
      // original operand instead.
      Value *Op = Scalar;
      if (auto *CI = dyn_cast<CastInst>(Scalar);
          isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
        Value *Src = CI->getOperand(0);
        if (auto *IOp = dyn_cast<Instruction>(Src);
            !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
          Op = Src;
      }
      Scalar = Builder.CreateIntCast(
          Op, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
    }

    Instruction *InsElt;
    if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
      Vec = InsElt = Builder.CreateInsertVector(
          Vec->getType(), Vec, Scalar,
          Builder.getInt64(Pos * VecTy->getNumElements()));
      auto *II = dyn_cast<IntrinsicInst>(InsElt);
      if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
        return Vec;
    } else {
      Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
      InsElt = dyn_cast<InsertElementInst>(Vec);
      if (!InsElt)
        return Vec;
    }
    GatherShuffleExtractSeq.insert(InsElt);
    CSEBlocks.insert(InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(V)) {
      if (TreeEntry *Entry = getTreeEntry(V)) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (Scalar != V) {
          if (auto *SI = dyn_cast<Instruction>(Scalar))
            UserOp = SI;
        } else {
          UserOp = InsElt;
        }
        if (UserOp) {
          unsigned FoundLane = Entry->findLaneForValue(V);
          ExternalUses.emplace_back(V, UserOp, FoundLane);
        }
      }
    }
    return Vec;
  };
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  Value *Vec = Root ? Root : PoisonValue::get(VecTy);
  SmallVector<int> NonConsts;
  // Insert constant values at first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(I))
      continue;
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    if (Root) {
      if (!isa<UndefValue>(VL[I])) {
        NonConsts.push_back(I);
        continue;
      }
      if (isa<PoisonValue>(VL[I]))
        continue;
      if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
        if (SV->getMaskValue(I) == PoisonMaskElem)
          continue;
      }
    }
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions which are/may be part of the loop at the end, to make
  // it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);

  return Vec;
}
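// For example, gathering {%x, 1.0} of float type emits the constant lane
// first and the non-constant lane second:
//   %g0 = insertelement <2 x float> poison, float 1.000000e+00, i32 1
//   %g1 = insertelement <2 x float> %g0, float %x, i32 0
// while values defined inside the current loop are postponed to the end of
// the sequence so the loop-invariant prefix can be hoisted later.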
/// Merges shuffle masks and emits the final shufflevector instruction, if
/// required. It supports shuffling of 2 input vectors and implements lazy
/// shuffle emission: the actual instruction is generated only when actually
/// required, otherwise emission is delayed until the end of the process to
/// reduce the number of emitted instructions.
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  /// Combined mask for all applied operands and masks. It is built during
  /// analysis and actual emission of the shuffle vector instructions.
  SmallVector<int> CommonMask;
  /// List of operands for the shuffle vector instruction. It holds at max 2
  /// operands; if a 3rd is going to be added, the first 2 are combined into a
  /// shuffle with \p CommonMask, the first operand becomes the resulting
  /// shuffle and the second operand becomes the newly added operand.
  SmallVector<Value *, 2> InVectors;
  IRBuilderBase &Builder;
  BoUpSLP &R;

  class ShuffleIRBuilder {
    IRBuilderBase &Builder;
    /// Holds all of the instructions that we gathered.
    SetVector<Instruction *> &GatherShuffleExtractSeq;
    /// A list of blocks that we are going to CSE.
    DenseSet<BasicBlock *> &CSEBlocks;
    /// Data layout.
    const DataLayout &DL;

  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates a shufflevector for the 2 operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (V1->getType() != V2->getType()) {
          if (cast<VectorType>(V2->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            V2 = Builder.CreateIntCast(
                V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
          else
            V1 = Builder.CreateIntCast(
                V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
        }
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates a permutation of the single vector operand with the given
    /// mask, if it is not an identity mask.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      unsigned VF = Mask.size();
      unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
        return V1;
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(getWidenedType(Ty, VF));
    }
    /// Resizes the 2 input vectors to match, if they are not equal yet. The
    /// smaller vector is resized to the size of the larger one.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
  };

  /// Smart shuffle instruction emission, walks through the shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder);
  }

  /// Transforms mask \p CommonMask per given \p Mask to make a proper set
  /// after shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx;
  }

  /// Cast value \p V to the vector type with the same number of elements, but
  /// with the base type \p ScalarTy.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V,
        VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
        IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}

  /// Adjusts extractelements after reusing them.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      int Idx = Mask[I];
      if (Idx == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
      VecBase = EI->getVectorOperand();
      if (const TreeEntry *TE = R.getTreeEntry(VecBase))
        VecBase = TE->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If its only use is vectorized - the extractelement itself can be
      // erased.
      if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            const TreeEntry *UTE = R.getTreeEntry(U);
            return !UTE || R.MultiNodeScalars.contains(U) ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return any_of(TE->UserTreeIndices,
                                            [&](const EdgeInfo &Edge) {
                                              return Edge.UserTE == UTE;
                                            }) &&
                                     is_contained(TE->Scalars, EI);
                            }) != 1;
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
      for (auto [I, Idx] : enumerate(Mask))
        if (Idx != PoisonMaskElem)
          Idx = I;
    };
    // Perform a multi-register vector shuffle, joining the parts into a
    // single virtual long vector. Each part is shuffled independently and
    // then inserted into the long virtual register, forming the original
    // vector.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
      ArrayRef<Value *> VL =
          ArrayRef(E->Scalars).slice(Part * SliceSize, Limit);
      MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(VL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (const TreeEntry *TE = R.getTreeEntry(VecOp))
              VecOp = TE->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        if (I == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (const TreeEntry *TE = R.getTreeEntry(VecOp))
          VecOp = TE->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert((Part == 0 ||
                all_of(seq<unsigned>(0, Part),
                       [&](unsigned P) {
                         ArrayRef<int> SubMask = Mask.slice(
                             P * SliceSize,
                             getNumElems(Mask.size(), SliceSize, P));
                         return all_of(SubMask, [](int Idx) {
                           return Idx == PoisonMaskElem;
                         });
                       })) &&
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        unsigned NewVF =
            cast<FixedVectorType>(Vec->getType())->getNumElements();
        if (Vec->getType() != SubVec->getType()) {
          unsigned SubVecVF =
              cast<FixedVectorType>(SubVec->getType())->getNumElements();
          NewVF = std::max(NewVF, SubVecVF);
        }
        // Adjust SubMask.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += NewVF;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    copy(VecMask, Mask.begin());
    return Vec;
  }

  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all deps are ready.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the
    // process to keep correct order.
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        ResVecTy,
        PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
        MaybeAlign());
  }

  /// Adds 2 input vectors (in form of tree entries) and the mask for their
  /// shuffling.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    Value *V2 = E2.VectorizedValue;
    if (V2->getType()->isIntOrIntVectorTy())
      V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    add(V1, V2, Mask);
  }
  /// Adds a single input vector (in form of a tree entry) and the mask for
  /// its shuffling.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    add(V1, Mask);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + Sz;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds one more input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool = false) {
    assert(isa<FixedVectorType>(V1->getType()) &&
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] =
                V->getType() != V1->getType()
                    ? Idx + Sz
                    : Mask[Idx] + cast<FixedVectorType>(V1->getType())
                                      ->getNumElements();
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // Check if the second vector is required if the used elements are
      // already used from the first one.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
          InVectors.push_back(V1);
          break;
        }
    }
    int VF = getVF(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  /// Adds one more input vector, reordered according to \p Order.
  void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
    SmallVector<int> NewMask;
    inversePermutation(Order, NewMask);
    add(V1, NewMask);
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy);
  }
  Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
  /// Finalize emission of the shuffles.
  /// \param Action the action (if any) to be performed before the final
  /// application of the \p ExtMask mask.
  Value *
  finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    SmallVector<int> NewExtMask(ExtMask);
    if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
      transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
                                             CommonMask);
      transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
                                             NewExtMask);
      ExtMask = NewExtMask;
    }
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask);
      InVectors.front() = Vec;
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                 bool PostponedPHIs) {
  ValueList &VL = E->getOperand(NodeIdx);
  const unsigned VF = VL.size();
  InstructionsState S = getSameOpcode(VL, *TLI);
  // Special processing for GEPs bundle, which may include non-gep values.
  if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    if (It != VL.end())
      S = getSameOpcode(*It, *TLI);
  }
  if (S.getOpcode()) {
    auto CheckSameVE = [&](const TreeEntry *VE) {
      return VE->isSame(VL) &&
             (any_of(VE->UserTreeIndices,
                     [E, NodeIdx](const EdgeInfo &EI) {
                       return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
                     }) ||
              any_of(VectorizableTree,
                     [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
                       return TE->isOperandGatherNode({E, NodeIdx}) &&
                              VE->isSame(TE->Scalars);
                     }));
    };
    TreeEntry *VE = getTreeEntry(S.OpValue);
    bool IsSameVE = VE && CheckSameVE(VE);
    if (!IsSameVE) {
      auto It = MultiNodeScalars.find(S.OpValue);
      if (It != MultiNodeScalars.end()) {
        auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
          return TE != VE && CheckSameVE(TE);
        });
        if (I != It->getSecond().end()) {
          VE = *I;
          IsSameVE = true;
        }
      }
    }
    if (IsSameVE) {
      auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
        // V may be affected by MinBWs, so take the element type of V itself.
        Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
        ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
        ShuffleBuilder.add(V, Mask);
        return ShuffleBuilder.finalize(std::nullopt);
      };
      Value *V = vectorizeTree(VE, PostponedPHIs);
      if (VF * getNumElements(VL[0]->getType()) !=
          cast<FixedVectorType>(V->getType())->getNumElements()) {
        if (!VE->ReuseShuffleIndices.empty()) {
          // Reshuffle to get only unique values.
          SmallVector<int> Mask(VF, PoisonMaskElem);
          for (auto [I, V] : enumerate(VL)) {
            if (isa<PoisonValue>(V))
              continue;
            Mask[I] = VE->findLaneForValue(V);
          }
          V = FinalShuffle(V, Mask);
        } else {
          assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
                 "Expected vectorization factor less "
                 "than original vector size.");
          SmallVector<int> UniformMask(VF, 0);
          std::iota(UniformMask.begin(), UniformMask.end(), 0);
          V = FinalShuffle(V, UniformMask);
        }
      }
      // Need to update the operand gather node if the operand is actually not
      // a vectorized node, but a buildvector/gather node which matches one of
      // the vectorized nodes.
      if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
            return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
          }) == VE->UserTreeIndices.end()) {
        auto *It = find_if(
            VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
              return TE->isGather() &&
                     TE->UserTreeIndices.front().UserTE == E &&
                     TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
            });
        assert(It != VectorizableTree.end() && "Expected gather node operand.");
        (*It)->VectorizedValue = V;
      }
      return V;
    }
  }

  // Find the corresponding gather entry and vectorize it. This allows being
  // more accurate with tree/graph transformations and checks for the
  // correctness of the transformations in many cases.
  auto *I = find_if(VectorizableTree,
                    [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
                      return TE->isOperandGatherNode({E, NodeIdx});
                    });
  assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
  assert(I->get()->UserTreeIndices.size() == 1 &&
         "Expected only single user for the gather node.");
  assert(I->get()->isSame(VL) && "Expected same list of scalars.");
  return vectorizeTree(I->get(), PostponedPHIs);
}
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
                                       E->ReuseShuffleIndices.end());
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Build a mask out of the reorder indices and reorder the scalars per this
  // mask.
  SmallVector<int> ReorderMask;
  inversePermutation(E->ReorderIndices, ReorderMask);
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
    unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    auto *It =
        find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
          return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
                   return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
                 }) != TE->UserTreeIndices.end();
        });
    if (It == VectorizableTree.end())
      return false;
    int Idx;
    if ((Mask.size() < InputVF &&
         ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
         Idx == 0) ||
        (Mask.size() == InputVF &&
         ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
      std::iota(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), (I + 1) * SliceSize), 0);
    } else {
      unsigned IVal =
          *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
      std::fill(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= GatheredScalars.size())
    NumParts = 1;
  if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
    // Check for gathered extracts.
    bool Resized = false;
    ExtractShuffles =
        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(ExtractMask)) {
        if (I == PoisonMaskElem)
          continue;
        if (const auto *TE = getTreeEntry(
                cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
          ExtractEntries.push_back(TE);
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep correct order.
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            Resized = true;
            GatheredScalars.append(VF - GatheredScalars.size(),
                                   PoisonValue::get(OrigScalarTy));
          }
      }
    }
    // Gather extracts after we check for a full matched gather only.
    if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
        E->isAltShuffle() ||
        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffles =
          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
    }
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep correct order.
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
          Entries.front().front()->isSame(E->Scalars)) {
        // Perfect match in the graph, will reuse the previously vectorized
        // node. Cost is 0.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          << shortBundleName(E->Scalars) << ".\n");
        // Restore the mask for previous partially matched values.
        Mask.resize(E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
        } else {
          for (auto [I, V] : enumerate(E->Scalars)) {
            if (isa<PoisonValue>(V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        ShuffleBuilder.add(*FrontTE, Mask);
        Res = ShuffleBuilder.finalize(E->getCommonMask());
        return Res;
      }
      if (!Resized) {
        if (GatheredScalars.size() != VF &&
            any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
              return any_of(TEs, [&](const TreeEntry *TE) {
                return TE->getVectorFactor() == VF;
              });
            }))
          GatheredScalars.append(VF - GatheredScalars.size(),
                                 PoisonValue::get(OrigScalarTy));
      }
      // Remove shuffled elements from the list of gathers.
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        if (Mask[I] != PoisonMaskElem)
          GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
      }
    }
  }
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
    Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-const values and all constant values.
    // For repeated values, just shuffle them.
    int NumNonConsts = 0;
    int SinglePos = 0;
    for (auto [I, V] : enumerate(Scalars)) {
      if (isa<UndefValue>(V)) {
        if (!isa<PoisonValue>(V)) {
          ReuseMask[I] = I;
          UndefPos.push_back(I);
        }
        continue;
      }
      if (isConstant(V)) {
        ReuseMask[I] = I;
        continue;
      }
      ++NumNonConsts;
      SinglePos = I;
      Value *OrigV = V;
      Scalars[I] = PoisonValue::get(OrigScalarTy);
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore the single insert element.
      if (IsSplat) {
        ReuseMask.assign(VF, PoisonMaskElem);
        std::swap(Scalars.front(), Scalars[SinglePos]);
        if (!UndefPos.empty() && UndefPos.front() == 0)
          Scalars.front() = UndefValue::get(OrigScalarTy);
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with the simple broadcast.
      // We can do it if the broadcasted value is guaranteed to be
      // non-poisonous, or by freezing the incoming scalar value first.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
                (E->UserTreeIndices.size() == 1 &&
                 any_of(V->uses(), [E](const Use &U) {
                   // Check if the value is already used in the same operation
                   // in one of the nodes already.
                   return E->UserTreeIndices.front().EdgeIdx !=
                              U.getOperandNo() &&
                          is_contained(
                              E->UserTreeIndices.front().UserTE->Scalars,
                              U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit a broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // Set the undef position to the non-poisoned scalar.
          ReuseMask[I] = Pos;
          // Replace the undef by poison; in the mask it is replaced by the
          // non-poisoned scalar already.
          if (I != Pos)
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
      } else {
        // Replace undefs by poisons, emit a broadcast and then emit a freeze.
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          if (isa<UndefValue>(Scalars[I]))
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // Gather of extractelements can be represented as just a shuffle of
      // a single/two vectors the scalars are extracted from.
      // Find input vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
          ExtractMask[I] = PoisonMaskElem;
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          if (isa<UndefValue>(E->Scalars[I]))
            continue;
          auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
          Value *VecOp = EI->getVectorOperand();
          if (const auto *TE = getTreeEntry(VecOp))
            if (TE->VectorizedValue)
              VecOp = TE->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        IsNonPoisoned &=
            isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size());
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
      } else {
        IsUsedInExpr = false;
        ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
                           /*ForExtracts=*/true);
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
        auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
        VecMask.assign(VecMask.size(), PoisonMaskElem);
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          IsUsedInExpr &= FindReusedSplat(
              VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          if (TEs.front()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
                isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
        }
      }
    }
    // Try to figure out the best way to combine values: build a shuffle and
    // insert elements, or just build several shuffles.
    // Insert non-constant scalars.
    SmallVector<Value *> NonConstants(GatheredScalars);
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if currently we
    // have a single permutation and more than 1 scalar constants.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(V) && !isa<PoisonValue>(V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(V) && !isa<UndefValue>(V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(V) && !isa<PoisonValue>(V);
         }) > 1);
    // NonConstants array contains just non-constant values, GatheredScalars
    // contains only constants to build the final vector and then shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(OrigScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
    // Generate constants for the final shuffle and build a mask for them.
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
            TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
            Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
          });
  } else if (!allConstant(GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
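// For example, the splat-like gather {%x, %x, undef, %x} is packed by
// TryPackScalars into a single lane plus broadcast mask <0, 0, 0, 0>; the
// undef lane is served by the same broadcast when that is provably safe, and
// otherwise the result is frozen (NeedFreeze) to avoid propagating poison.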
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder,
                                                                *this);
}
Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
  IRBuilderBase::InsertPointGuard Guard(Builder);

  if (E->VectorizedValue &&
      (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
       E->isAltShuffle())) {
    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  if (auto *Store = dyn_cast<StoreInst>(V))
    ScalarTy = Store->getValueOperand()->getType();
  else if (auto *IE = dyn_cast<InsertElementInst>(V))
    ScalarTy = IE->getOperand(1)->getType();
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    auto VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    if (VecTy)
      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
  }
  auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
  if (E->isGather()) {
    // Set the insert point for non-reduction initial nodes.
    if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy);
    E->VectorizedValue = Vec;
    return Vec;
  }
  bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
      ShuffleBuilder.addOrdered(V, std::nullopt);
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() ||
            !E->UserTreeIndices.empty()) &&
           "PHI reordering is free.");
    if (PostponedPHIs && E->VectorizedValue)
      return E->VectorizedValue;
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    Builder.SetCurrentDebugLocation(PH->getDebugLoc());
    if (PostponedPHIs || !E->VectorizedValue) {
      PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
      E->PHI = NewPhi;
      Value *V = NewPhi;

      // Adjust the insertion point once all PHIs have been generated.
      Builder.SetInsertPoint(PH->getParent(),
                             PH->getParent()->getFirstInsertionPt());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());

      V = FinalShuffle(V, E, VecTy);

      E->VectorizedValue = V;
      if (PostponedPHIs)
        return V;
    }
    PHINode *NewPhi = cast<PHINode>(E->PHI);

    // PHINodes may have multiple entries from the same block. We want to
    // visit every block once.
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;

    for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
      BasicBlock *IBB = PH->getIncomingBlock(I);

      // Stop emission if all incoming values are generated.
      if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return NewPhi;
      }

      if (!VisitedBBs.insert(IBB).second) {
        NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
        continue;
      }

      Builder.SetInsertPoint(IBB->getTerminator());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
      Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      NewPhi->addIncoming(Vec, IBB);
    }

    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
           "Invalid number of incoming values");
    return NewPhi;
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    if (const TreeEntry *TE = getTreeEntry(V))
      V = TE->VectorizedValue;
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E, VecTy);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    Value *NewV = propagateMetadata(V, E->Scalars);
    NewV = FinalShuffle(NewV, E, VecTy);
    E->VectorizedValue = NewV;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
    Value *V = vectorizeOperand(E, 1, PostponedPHIs);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
      assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
          V,
          getWidenedType(
              ScalarTy,
              cast<FixedVectorType>(V->getType())->getNumElements()),
          Res.second);
    }

    // Create an InsertVector shuffle if necessary.
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();

    unsigned Offset = *getElementIndex(VL0);
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create a shuffle to resize the vector.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(NumElts - NumScalars, PoisonMaskElem);
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      Value *Scalar = E->Scalars[PrevMask[I]];
      unsigned InsertIdx = *getElementIndex(Scalar);
      IsIdentity &= InsertIdx - Offset == I;
      Mask[InsertIdx - Offset] = I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // Follow all insertelement instructions from the current buildvector
        // sequence.
        InsertElementInst *Ins = cast<InsertElementInst>(VL0);
        do {
          std::optional<unsigned> InsertIdx = getElementIndex(Ins);
          if (!InsertIdx)
            break;
          if (InsertMask[*InsertIdx] == PoisonMaskElem)
            InsertMask[*InsertIdx] = *InsertIdx;
          if (!Ins->hasOneUse())
            break;
          Ins = dyn_cast_or_null<InsertElementInst>(
              Ins->getUniqueUndroppableUser());
        } while (Ins);
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        SmallBitVector IsFirstUndef =
            isUndefVector(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          unsigned Idx = 0;
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              if (!V2)
                V2 = UndefValue::get(V->getType());
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              ++Idx;
            } else if (InsertMask[I] != PoisonMaskElem &&
                       Mask[I] == PoisonMaskElem) {
              InsertMask[I] = PoisonMaskElem;
            }
          }
        } else {
          InsertMask = Mask;
        }
      }
      if (!V2)
        V2 = PoisonValue::get(V->getType());
      V = Builder.CreateShuffleVector(V, V2, InsertMask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    for (unsigned I = 0; I < NumElts; I++) {
      if (Mask[I] != PoisonMaskElem)
        InsertMask[Offset + I] = I;
    }
    SmallBitVector UseMask =
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
    SmallBitVector IsFirstUndef =
        isUndefVector(FirstInsert->getOperand(0), UseMask);
    if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
        NumElts != NumScalars) {
      if (IsFirstUndef.all()) {
        if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
              if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
                InsertMask[I] = I + NumElts;
            }
          }
          V = Builder.CreateShuffleVector(
              V,
              IsFirstPoison.all() ? PoisonValue::get(V->getType())
                                  : FirstInsert->getOperand(0),
              InsertMask, cast<Instruction>(E->Scalars.back())->getName());
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      } else {
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        for (unsigned I = 0; I < NumElts; I++) {
          if (InsertMask[I] == PoisonMaskElem)
            InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
          else
            InsertMask[I] += NumElts;
        }
        V = Builder.CreateShuffleVector(
            FirstInsert->getOperand(0), V, InsertMask,
            cast<Instruction>(E->Scalars.back())->getName());
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
    }

    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    auto *CI = cast<CastInst>(VL0);
    Instruction::CastOps VecOpcode = CI->getOpcode();
    Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *R = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (L->getType() != R->getType()) {
      assert((getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }

    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    Value *V = Builder.CreateCmp(P0, L, R);
    propagateIRFlags(V, E->Scalars, VL0);
    // Do not cast for cmps.
    VecTy = cast<FixedVectorType>(V->getType());
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *True = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *False = vectorizeOperand(E, 2, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
              getOperandEntry(E, 2)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }

    unsigned CondNumElements = getNumElements(Cond->getType());
    unsigned TrueNumElements = getNumElements(True->getType());
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
    assert(TrueNumElements == getNumElements(False->getType()) &&
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      // When the return type is an i1 vector narrower than the operands, the
      // condition value has to be replicated.
      Cond = Builder.CreateShuffleVector(
          Cond, createReplicatedMask(TrueNumElements / CondNumElements,
                                     CondNumElements));
    }
    assert(getNumElements(Cond->getType()) == TrueNumElements &&
           "Cannot vectorize Instruction::Select");
    Value *V = Builder.CreateSelect(Cond, True, False);
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);

    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    Value *V = Builder.CreateUnOp(
        static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *I = dyn_cast<Instruction>(V))
      V = propagateMetadata(I, E->Scalars);

    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);

    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (Op->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
        ArrayRef<Value *> Ops = E->getOperand(I);
        if (all_of(Ops, [&](Value *Op) {
              auto *CI = dyn_cast<ConstantInt>(Op);
              return CI && CI->getValue().countr_one() >= It->second.first;
            })) {
          Value *V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy);
          E->VectorizedValue = V;
          ++NumVectorInstructions;
          return V;
        }
      }
    }
    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (LHS->getType() != VecTy)
        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      if (RHS->getType() != VecTy)
        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }

    Value *V = Builder.CreateBinOp(
        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
    propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = propagateMetadata(I, E->Scalars);
      // Drop nuw flags for abs(sub(commutative), true).
      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          any_of(E->Scalars, [](Value *V) {
            return isCommutative(cast<Instruction>(V));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    }

    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::StridedVectorize) {
      Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
      Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
      PO = IsReverseOrder ? PtrN : Ptr0;
      std::optional<int> Diff = getPointersDiff(
          VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
      Type *StrideTy = DL->getIndexType(PO->getType());
      Value *StrideVal;
      if (Diff) {
        int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
        StrideVal =
            ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                           DL->getTypeAllocSize(ScalarTy));
      } else {
        SmallVector<Value *> PointerOps(E->Scalars.size());
        transform(E->Scalars, PointerOps.begin(), [](Value *V) {
          return cast<LoadInst>(V)->getPointerOperand();
        });
        OrdersType Order;
        std::optional<Value *> Stride =
            calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
                              &*Builder.GetInsertPoint());
        Value *NewStride =
            Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
        StrideVal = Builder.CreateMul(
            NewStride,
            ConstantInt::get(
                StrideTy,
                (IsReverseOrder ? -1 : 1) *
                    static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
      }
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {VecTy, PO->getType(), StrideTy},
          {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /*ArgNo=*/0,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      // Use the minimum alignment of the gathered loads.
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = propagateMetadata(NewLI, E->Scalars);

    V = FinalShuffle(V, E, VecTy);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
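// For example, four i32 loads at consecutive decreasing addresses become one
// strided VP load with a -4 byte stride:
//   %l = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
//            ptr align 4 %pN, i64 -4,
//            <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
// with the common alignment attached as a parameter attribute on the pointer.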
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);

    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E, VecTy);

    Value *Ptr = SI->getPointerOperand();
    Instruction *ST;
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
        Ptr = SI->getPointerOperand();
      }
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          {VecValue, Ptr,
           ConstantInt::get(
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /*ArgNo=*/1,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      ST = Inst;
    }

    Value *V = propagateMetadata(ST, E->Scalars);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
    case Instruction::Call: {
      CallInst *CI = cast<CallInst>(VL0);
      setInsertPointAfterBundle(E);

                          It != MinBWs.end() ? It->second.first : 0);
          VecCallCosts.first <= VecCallCosts.second;

      Value *ScalarArg = nullptr;
      auto *CEI = cast<CallInst>(VL0);
      for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
          ScalarArg = CEI->getArgOperand(I);
          if (ID == Intrinsic::abs && It != MinBWs.end() &&
              It->second.first < DL->getTypeSizeInBits(CEI->getType()))

        Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        ScalarArg = CEI->getArgOperand(I);
        if (cast<VectorType>(OpVec->getType())->getElementType() !=
            It == MinBWs.end()) {
          OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
        } else if (It != MinBWs.end()) {
          OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
        }
      }

      if (!UseIntrinsic) {
      }
      V = FinalShuffle(V, E, VecTy);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
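// Hedged sketch of the decision the call case above encodes: the vectorizer
// compares the cost of emitting a vector intrinsic against a vector library
// call and keeps the cheaper lowering (UseIntrinsic holds exactly when
// VecCallCosts.first <= VecCallCosts.second). The struct below is an
// illustrative stand-in, not LLVM's cost model.
struct VecCallCost {
  int Intrinsic; // cost of lowering as a vector intrinsic
  int LibCall;   // cost of lowering as a vector library call
};

// Returns true when the intrinsic form should be emitted.
inline bool useIntrinsic(const VecCallCost &C) {
  return C.Intrinsic <= C.LibCall;
}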
    case Instruction::ShuffleVector: {
      if (SLPReVec && !E->isAltShuffle()) {
        assert(E->ReuseShuffleIndices.empty() &&
               "ReuseShuffleIndices not supported yet.");
        assert(E->ReorderIndices.empty() &&
               "ReorderIndices not supported yet.");
        setInsertPointAfterBundle(E);
        Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
      } else {
        assert(E->isAltShuffle() &&
               (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp())) &&
               "Invalid Shuffle Vector Operand");
          setInsertPointAfterBundle(E);
          LHS = vectorizeOperand(E, 0, PostponedPHIs);
          if (E->VectorizedValue) {
            LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
            return E->VectorizedValue;
          }
          RHS = vectorizeOperand(E, 1, PostponedPHIs);
        } else {
          setInsertPointAfterBundle(E);
          LHS = vectorizeOperand(E, 0, PostponedPHIs);
        }
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
               getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
               getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
               MinBWs.contains(getOperandEntry(E, 0)) ||
               MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
                    ->getIntegerBitWidth() <
                cast<VectorType>(RHS->getType())->getIntegerBitWidth())
        } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
          V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
          auto *AltCI = cast<CmpInst>(E->getAltOp());
          V1 = Builder.CreateCmp(AltPred, LHS, RHS);
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              cast<VectorType>(LHS->getType())->getElementType());
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
                   "Expected same type as operand.");
            if (auto *I = dyn_cast<Instruction>(LHS))
            E->VectorizedValue = LHS;
            ++NumVectorInstructions;
          }
        }
        for (Value *V : {V0, V1}) {
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
        E->buildAltOpShuffleMask(
              assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            Mask, &OpScalars, &AltScalars);

        auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
          if (auto *I = dyn_cast<Instruction>(Vec);
              I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
                auto *IV = cast<Instruction>(V);
                return IV->getOpcode() == Instruction::Sub &&
                       isCommutative(cast<Instruction>(IV));
            I->setHasNoUnsignedWrap(false);
        };
        DropNuwFlag(V0, E->getOpcode());
        DropNuwFlag(V1, E->getAltOpcode());

        if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
        }
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
        E->VectorizedValue = V;
        ++NumVectorInstructions;
  return vectorizeTree(ExternallyUsedValues, ReplacedExternals);

struct ShuffledInsertData {
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }
  EntryToLastInstruction.clear();
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
    if (TE->State == TreeEntry::Vectorize &&
        TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
        TE->VectorizedValue)
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
      if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
              TE->UserTreeIndices.front().EdgeIdx)) &&
          VecTE->isSame(TE->Scalars))
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    if (isa<PHINode>(UserI)) {
      for (User *U : PrevVec->users()) {
        auto *UI = dyn_cast<Instruction>(U);
        if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
        if (UI->comesBefore(InsertPt))
      }
    }
    if (Vec->getType() != PrevVec->getType()) {
      assert(PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        if (const TreeEntry *BaseTE = getTreeEntry(V)) {
          auto It = MinBWs.find(BaseTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
          }
          for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
            auto It = MinBWs.find(MNTE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
            }
          }
          if (IsSigned.value_or(false))
          for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
            auto It = MinBWs.find(BVE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
            }
          }
          if (IsSigned.value_or(false))
          if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
            IsSigned = IsSigned.value_or(false) ||
          }
          if (IsSigned.value_or(false))
        }
      }
      if (IsSigned.value_or(false)) {
        auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
             "Expected user node or perfect diamond match in MinBWs.");
    }
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
  }
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    TreeEntry *E = getTreeEntry(Scalar);
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        auto *Inst = dyn_cast<Instruction>(Scalar);
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
              if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
            auto *CloneInst = Inst->clone();
            CloneInst->insertBefore(Inst);
            if (Inst->hasName())
          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
                     ES && isa<Instruction>(Vec)) {
            Value *V = ES->getVectorOperand();
            auto *IVec = cast<Instruction>(Vec);
            if (const TreeEntry *ETE = getTreeEntry(V))
              V = ETE->VectorizedValue;
            if (auto *IV = dyn_cast<Instruction>(V);
                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
          } else if (auto *VecTy =
                         dyn_cast<FixedVectorType>(Scalar->getType())) {
                Vec, Builder.getInt64(ExternalUse.Lane * VecTyNumElements));
          }
        if (Scalar->getType() != Ex->getType())
                              MinBWs.find(E)->second.second);
        auto *I = dyn_cast<Instruction>(Ex);
                           : &F->getEntryBlock(),
                          std::make_pair(Ex, ExV));
        if (auto *ExI = dyn_cast<Instruction>(Ex);
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
      }
      assert(isa<FixedVectorType>(Scalar->getType()) &&
             isa<InsertElementInst>(Scalar) &&
             "In-tree scalar of vector type is not insertelement?");
      auto *IE = cast<InsertElementInst>(Scalar);
    };
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
          ExternalUsesAsOriginalScalar.contains(Scalar) ||
                 if (ExternalUsesAsOriginalScalar.contains(U))
                 TreeEntry *UseEntry = getTreeEntry(U);
                        (UseEntry->State == TreeEntry::Vectorize ||
                             TreeEntry::StridedVectorize) &&
                        (E->State == TreeEntry::Vectorize ||
                         E->State == TreeEntry::StridedVectorize) &&
                        doesInTreeUserNeedToExtract(
                            cast<Instruction>(UseEntry->Scalars.front()),
             "Scalar with nullptr User must be registered in "
             "ExternallyUsedValues map or remain as scalar in vectorized "
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI)) {
          if (PHI->getParent()->isLandingPad())
                PHI->getParent()->getLandingPadInst()->getIterator()));
                PHI->getParent()->getFirstNonPHIIt());
          std::next(VecI->getIterator()));
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      Scalar->replaceAllUsesWith(NewInst);
      ReplacedExternals.emplace_back(Scalar, NewInst);

    if (auto *VU = dyn_cast<InsertElementInst>(User);
      if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
        if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
          if (!UsedInserts.insert(VU).second)
          auto BWIt = MinBWs.find(E);
            auto *ScalarTy = FTy->getElementType();
            auto Key = std::make_pair(Vec, ScalarTy);
            auto VecIt = VectorCasts.find(Key);
            if (VecIt == VectorCasts.end()) {
              if (auto *IVec = dyn_cast<PHINode>(Vec)) {
                if (IVec->getParent()->isLandingPad())
                      std::next(IVec->getParent()
                                    ->getLandingPadInst()
                      IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
              } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
              }
                      cast<FixedVectorType>(Vec->getType())->getNumElements()),
                  BWIt->second.second);
            } else {
              Vec = VecIt->second;
            }

              find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
          unsigned Idx = *InsertIdx;
          if (It == ShuffledInserts.end()) {
            It = std::next(ShuffledInserts.begin(), ShuffledInserts.size() - 1);
          }
            while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
              if (IEBase != User &&
                  (!IEBase->hasOneUse() ||
              if (const TreeEntry *E = getTreeEntry(IEBase)) {
                do {
                  IEBase = cast<InsertElementInst>(Base);
                         "InsertElementInstruction used already.");
                  Mask[IEIdx] = IEIdx;
                  Base = IEBase->getOperand(0);
                } while (E == getTreeEntry(Base));
                Base = cast<InsertElementInst>(Base)->getOperand(0);
              }
            }
            auto It = VectorToInsertElement.find(Base);
            if (It != VectorToInsertElement.end())
          Mask[Idx] = ExternalUse.Lane;
          It->InsertElements.push_back(cast<InsertElementInst>(User));
        }
      }
    }

    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
                PH->getIncomingBlock(I)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
                  std::next(VecI->getIterator()));
            }
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      }
    } else {
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
    }
  }
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
        CombinedMask1[I] = Mask[I];
        CombinedMask2[I] = Mask[I] - VF;
    }
        cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize(std::nullopt);
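// Standalone sketch of the mask split performed by the shuffle-creation
// helper above: a combined mask indexing into the concatenation of V1
// (lanes 0..VF-1) and V2 (lanes VF..2*VF-1) is split into one mask per
// source, with -1 marking an undef lane, mirroring shufflevector mask
// conventions. Names are illustrative.
#include <cstddef>
#include <utility>
#include <vector>

std::pair<std::vector<int>, std::vector<int>>
splitShuffleMask(const std::vector<int> &Mask, int VF) {
  std::vector<int> Mask1(Mask.size(), -1), Mask2(Mask.size(), -1);
  for (std::size_t I = 0; I < Mask.size(); ++I) {
    if (Mask[I] < 0)
      continue; // undef lane stays undef in both masks
    if (Mask[I] < VF)
      Mask1[I] = Mask[I];      // lane comes from V1
    else
      Mask2[I] = Mask[I] - VF; // lane comes from V2, rebased to 0..VF-1
  }
  return {Mask1, Mask2};
}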
                                   bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
        Vec = CreateShuffle(Vec, nullptr, Mask);
        return std::make_pair(Vec, true);
      }
      if (!ForSingleMask) {
        for (unsigned I = 0; I < VF; ++I) {
            ResizeMask[Mask[I]] = Mask[I];
        }
        Vec = CreateShuffle(Vec, nullptr, ResizeMask);
      }
    return std::make_pair(Vec, false);
  };

  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    Value *NewInst = performExtractsShuffleAction<Value>(
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                               Vals.back(), Mask);
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    if (It != ShuffledInserts[I].InsertElements.rend())
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
        II = dyn_cast<InsertElementInst>(II->getOperand(0));
    }
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
    LastInsert->replaceAllUsesWith(NewInst);
      IE->replaceUsesOfWith(IE->getOperand(0),
      IE->replaceUsesOfWith(IE->getOperand(1),
    CSEBlocks.insert(LastInsert->getParent());
  }
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    if (Entry->isGather())

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))

      Type *Ty = Scalar->getType();
        for (User *U : Scalar->users()) {
          assert((getTreeEntry(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
        }
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      auto *I = cast<Instruction>(Scalar);
    }
  }
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);

  if (UserIgnoreList) {
      if (getTreeEntry(I)->Idx != 0)
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
        }
        return UserIgnoreList->contains(U.getUser());
  }
  removeInstructionsAndOperands(ArrayRef(RemovedInsts));

  InstrElementSize.clear();
  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE);
      ReductionBitWidth != 0 && It != MinBWs.end() &&
      ReductionBitWidth != It->second.first) {
        ReductionRoot->getIterator());
            cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
  }
                    << " gather sequences instructions.\n");
    Loop *L = LI->getLoopFor(I->getParent());
    BasicBlock *PreHeader = L->getLoopPreheader();
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
    CSEBlocks.insert(PreHeader);
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  };

    if (I1->getType() != I2->getType())
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
          NewMask[I] != SM1[I])
        NewMask[I] = SM1[I];
    }
    return SM1.size() - LastUndefsCnt > 1 &&
                                     SM1.size() - LastUndefsCnt));
  };

  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
           "Worklist not sorted properly!");
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))

      bool Replaced = false;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          V->replaceAllUsesWith(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
        }
        Visited.push_back(&In);
  }
  GatherShuffleExtractSeq.clear();
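// Minimal sketch (illustrative types, not LLVM's) of the CSE rule applied
// above: of two identical-or-less-defined shuffles, the one whose block
// dominates the other survives, and the survivor may take a merged mask in
// which undef lanes of one are filled from the other. The dominance query
// itself is the classic DFS-interval test on the dominator tree:
struct DomNode {
  int DfsIn, DfsOut; // DFS entry/exit numbers on the dominator tree
};

// A dominates B iff B's DFS interval nests inside A's.
inline bool dominates(const DomNode &A, const DomNode &B) {
  return A.DfsIn <= B.DfsIn && B.DfsOut <= A.DfsOut;
}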
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
  ScheduleData *Bundle = nullptr;
  ScheduleData *PrevInBundle = nullptr;
  for (Value *V : VL) {
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member "
           "(maybe not in same basic block)");
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle) {
      PrevInBundle->NextInBundle = BundleMember;
    } else {
      Bundle = BundleMember;
    }
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  assert(Bundle && "Failed to find schedule bundle");
  return Bundle;
}
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](
                                   bool ReSchedule, ScheduleData *Bundle) {
    if (ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
        if (ScheduleData *SD = getScheduleData(I))
          SD->clearDependencies();
    }
                      << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);

      initialFillReadyList(ReadyInsts);

    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleData *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
             "must be ready to schedule");
      schedule(Picked, ReadyInsts);
    }
  };

  for (Value *V : VL) {
    if (!extendSchedulingRegion(V, S)) {
      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");
    ReadyInsts.remove(BundleMember);

    if (!BundleMember->IsScheduled)
    LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                      << " was already scheduled\n");
  }

  auto *Bundle = buildBundle(VL);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle->isReady()) {
    cancelScheduling(VL, S.OpValue);
    return std::nullopt;
  }
  ScheduleData *Bundle = getScheduleData(OpValue);
  LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() &&
         "tried to unbundle something which is not a bundle");

  if (Bundle->isReady())
    ReadyInsts.remove(Bundle);

  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->TE = nullptr;
    if (BundleMember->unscheduledDepsInBundle() == 0) {
      ReadyInsts.insert(BundleMember);
    }
    BundleMember = Next;
  }
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
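// The allocator above hands out ScheduleData objects from fixed-size chunks
// so that pointers stay stable while the scheduling region grows. A minimal
// standalone version of the same arena pattern (names illustrative):
#include <cstddef>
#include <memory>
#include <vector>

template <typename T, std::size_t ChunkSize = 256> class ChunkArena {
  std::vector<std::unique_ptr<T[]>> Chunks;
  std::size_t Pos = ChunkSize; // force a fresh chunk on first allocation
public:
  // Returns a pointer that stays valid for the arena's lifetime.
  T *allocate() {
    if (Pos >= ChunkSize) {
      Chunks.push_back(std::make_unique<T[]>(ChunkSize));
      Pos = 0;
    }
    return &Chunks.back()[Pos++];
  }
};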
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  assert(I && "bundle member must be an instruction");
         "phi nodes/insertelements/extractelements/extractvalues don't need to "
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }

      ++ScheduleStart->getIterator().getReverse();
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;

  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  while (!WorkList.empty()) {
    for (ScheduleData *BundleMember = SD; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      assert(isInSchedulingRegion(BundleMember));
      if (BundleMember->hasValidDependencies())

      BundleMember->Dependencies = 0;
      BundleMember->resetUnscheduledDeps();

      for (User *U : BundleMember->Inst->users()) {
        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
        }
      }

        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->ControlDependencies.push_back(BundleMember);
        BundleMember->Dependencies++;
        ScheduleData *DestBundle = DepDest->FirstInBundle;
        if (!DestBundle->IsScheduled)
          BundleMember->incrementUnscheduledDeps(1);
        if (!DestBundle->hasValidDependencies())

        for (Instruction *I = BundleMember->Inst->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          MakeControlDependent(I);
        }

      if (RegionHasStackSave) {
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
            if (!isa<AllocaInst>(I))
            MakeControlDependent(I);
          }
        }

        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
            MakeControlDependent(I);
          }
        }
      }

      ScheduleData *DepDest = BundleMember->NextLoadStore;
             "NextLoadStore list for non memory affecting bundle?");
      bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
      unsigned NumAliased = 0;
      unsigned DistToSrc = 1;

      for (; DepDest; DepDest = DepDest->NextLoadStore) {
        assert(isInSchedulingRegion(DepDest));
            ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
             SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
          DepDest->MemoryDependencies.push_back(BundleMember);
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = DepDest->FirstInBundle;
          if (!DestBundle->IsScheduled) {
            BundleMember->incrementUnscheduledDeps(1);
          }
          if (!DestBundle->hasValidDependencies()) {
          }
        }
      }
    }
  }

  if (InsertInReadyList && SD->isReady()) {
    ReadyInsts.insert(SD);
  }
}
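// The bookkeeping above follows the standard list-scheduling scheme: every
// bundle counts its dependencies, and it becomes ready once all bundles it
// waits on have been scheduled. A compact standalone model of that release
// mechanism (illustrative types, not the LLVM data structures):
#include <cstddef>
#include <vector>

struct SchedBundle {
  std::vector<std::size_t> Succs; // bundles that wait on this one
  int UnscheduledDeps = 0;        // predecessors still pending
  bool Scheduled = false;
};

// Scheduling bundle B releases its successors; returns newly ready bundles.
std::vector<std::size_t> scheduleBundle(std::vector<SchedBundle> &G,
                                        std::size_t B) {
  std::vector<std::size_t> Ready;
  G[B].Scheduled = true;
  for (std::size_t S : G[B].Succs)
    if (--G[S].UnscheduledDeps == 0 && !G[S].Scheduled)
      Ready.push_back(S);
  return Ready;
}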
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = getScheduleData(I)) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->IsScheduled = false;
      SD->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  BS->resetSchedule();

  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      TreeEntry *SDTE = getTreeEntry(SD->Inst);
             SD->isPartOfBundle() ==
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;

      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

#ifdef EXPENSIVE_CHECKS
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

  BS->ScheduleStart = nullptr;
}
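// The main loop above is a ready-list scheduler: bundles whose dependencies
// are all satisfied sit in an ordered set keyed by SchedulingPriority
// (original source order), and the lowest priority number is emitted first so
// the block is rebuilt in a valid, close-to-original order. An illustrative
// model of that ready list using a standard-library min-heap:
#include <queue>
#include <vector>

struct ReadyItem {
  int Priority; // lower = earlier in original source order
  int Id;
};
struct ByPriority {
  bool operator()(const ReadyItem &A, const ReadyItem &B) const {
    return A.Priority > B.Priority; // min-heap on Priority
  }
};
using ReadyList =
    std::priority_queue<ReadyItem, std::vector<ReadyItem>, ByPriority>;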
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  if (auto *I = dyn_cast<Instruction>(V)) {
    Value *FirstNonBool = nullptr;
    while (!Worklist.empty()) {
      auto *Ty = I->getType();
      if (isa<VectorType>(Ty))
      if (Ty != Builder.getInt1Ty() && !FirstNonBool)

      if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
        Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));

      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
              FirstNonBool = U.get();
          }
      }
    }
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
    Width = DL->getTypeSizeInBits(V->getType());

    InstrElementSize[I] = Width;
  }
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
    bool IsTruncRoot) const {
  if (all_of(E.Scalars, IsaPred<Constant>))

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      unsigned BitWidth1 = OrigBitWidth - NumSignBits;
      if (auto *I = dyn_cast<Instruction>(V)) {
        unsigned BitWidth2 =
            std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
        while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        }
        BitWidth1 = std::min(BitWidth1, BitWidth2);
      }
    }
  };
  using namespace std::placeholders;
  auto FinalAnalysis = [&]() {
    if (!IsProfitableToDemote)
      return false;
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    if (Res && E.isGather()) {
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
    }
  };
  if (E.isGather() || !Visited.insert(&E).second ||
        return all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !getTreeEntry(U);
        });
    return FinalAnalysis();

      return !all_of(V->users(), [=](User *U) {
               return getTreeEntry(U) ||
                      (E.Idx == 0 && UserIgnoreList &&
                       UserIgnoreList->contains(U)) ||
                      (!isa<CmpInst>(U) && U->getType()->isSized() &&
                       !U->getType()->isScalableTy() &&
                       DL->getTypeSizeInBits(U->getType()) <= BitWidth);
             }) && !IsPotentiallyTruncated(V, BitWidth);

                         bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, Level, IsProfitableToDemote,
        if (!IsProfitableToDemote)
        if (!FinalAnalysis())
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
  };
  auto AttemptCheckBitwidth =
    NeedToExit = false;
    unsigned BestFailBitwidth = 0;
      if (Checker(BitWidth, OrigBitWidth))
      if (BestFailBitwidth == 0 && FinalAnalysis())
    if (BestFailBitwidth == 0) {
    }
  };
  auto TryProcessInstruction =
        (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
        if (E.UserTreeIndices.size() > 1 &&
            !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,

        bool NeedToExit = false;
        if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
        if (!ProcessOperands(Operands, NeedToExit))

    return IsProfitableToDemote;
  };
  switch (E.getOpcode()) {
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Shl: {
      auto *I = cast<Instruction>(V);
      KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
      return AmtKnownBits.getMaxValue().ult(BitWidth);
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
              std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin &&
        ID != Intrinsic::umax)
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    }
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned VF = E.Scalars.size();
      if (Cost < BestCost) {
      }
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
  }

  return FinalAnalysis();
}
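// The demotion analysis above bounds the minimal usable bit width two ways:
// via the number of known sign bits (signed view, BitWidth1) and via known
// leading-zero bits (unsigned view, BitWidth2), then takes the smaller. A
// standalone sketch of the same arithmetic for a plain integer (names
// illustrative, not LLVM's KnownBits):
#include <algorithm>

// OrigBits - NumSignBits: bits needed under a signed interpretation.
// OrigBits - LeadingZeros: bits needed under an unsigned interpretation.
inline unsigned minimalBitWidth(unsigned OrigBits, unsigned NumSignBits,
                                unsigned LeadingZeros) {
  unsigned SignedBits = std::max(1u, OrigBits - NumSignBits);
  unsigned UnsignedBits = std::max(1u, OrigBits - LeadingZeros);
  return std::min(SignedBits, UnsignedBits);
}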
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->getOpcode() == Instruction::Store ||
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())

  if (VectorizableTree[NodeIdx]->isGather() ||
      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
      (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                                     static_cast<int>(NodeIdx);
    return;

  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    IsProfitableToDemoteRoot = true;
  }

  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
                                bool IsProfitableToDemoteRoot, unsigned Opcode,
                                unsigned Limit, bool IsTruncRoot,
                                bool IsSignedCmp) -> unsigned {
    if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
        E.Idx > (IsStoreOrInsertElt ? 2 : 1) &&
          return V->hasOneUse() || isa<Constant>(V) ||
                   const TreeEntry *TE = getTreeEntry(U);
                   const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
                   if (TE == UserTE || !TE)
                   unsigned UserTESz = DL->getTypeSizeInBits(
                       UserTE->Scalars.front()->getType());
                   auto It = MinBWs.find(TE);
                   if (It != MinBWs.end() && It->second.first > UserTESz)
                   return DL->getTypeSizeInBits(U->getType()) > UserTESz;
      const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
      return MaxBitWidth;
    }

    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT || !Opcode)

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))

    unsigned MaxBitWidth = 1u;

    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    for (Value *Root : E.Scalars) {
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      if (!IsKnownPositive)
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)

    if (NumParts > 1 &&

    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, MaxDepthLevel, NeedToDemote,
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                 DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
    MaxBitWidth = bit_ceil(MaxBitWidth);

    return MaxBitWidth;
  };

  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    for (Value *V : *UserIgnoreList) {
      auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      unsigned BitWidth2 = BitWidth1;
        BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      ReductionBitWidth =
          std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
    }
    if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
      ReductionBitWidth = 8;

    ReductionBitWidth = bit_ceil(ReductionBitWidth);
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  while (NodeIdx < VectorizableTree.size()) {
    unsigned Limit = 2;
    unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
        Limit, IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
          DL->getTypeSizeInBits(V->getType()->getScalarType());
      if (OrigBitWidth > MaxBitWidth) {
      }
    }
    RootDemotes.clear();
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                   EI.UserTE->getOpcode() == Instruction::Trunc &&
                   !EI.UserTE->isAltShuffle();
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                   return EI.UserTE->getOpcode() == Instruction::ICmp &&
                            auto *IC = dyn_cast<ICmpInst>(V);
                              !isKnownNonNegative(IC->getOperand(0),
                                                  SimplifyQuery(*DL)) ||
                              !isKnownNonNegative(IC->getOperand(1),
                                                  SimplifyQuery(*DL)));
    }

    if (MaxBitWidth == 0 ||
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
      if (UserIgnoreList)
    }
  }

  for (unsigned Idx : ToDemote) {
    TreeEntry *TE = VectorizableTree[Idx].get();
    bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
      return !isKnownNonNegative(R, SimplifyQuery(*DL));
    });
  }
  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);

  DL = &F.getDataLayout();

  bool Changed = false;

    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");

  if (F.hasFnAttribute(Attribute::NoImplicitFloat))

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  for (auto *BB : post_order(&F.getEntryBlock())) {
    R.clearReductionData();
    collectSeedInstructions(BB);

    if (!Stores.empty()) {
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    Changed |= vectorizeChainsInBlock(BB, R);

    if (!GEPs.empty()) {
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  R.optimizeGatherSequence();
                                              unsigned Idx, unsigned MinVF,
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
         (!S.MainOp->isSafeToRemove() ||
            return !isa<ExtractElementInst>(V) &&
                   (V->getNumUses() > Chain.size() ||
                    any_of(V->users(), [&](User *U) {
                      return !Stores.contains(U);
                    }));
        (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
      Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
    }
  }
  if (R.isLoadCombineCandidate(Chain))
  R.buildTree(Chain);
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getTreeSize();
  }
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getTreeSize();
  if (S.getOpcode() == Instruction::Load)

    using namespace ore;
                                        cast<StoreInst>(Chain[0]))
              << "Stores SLP vectorized with cost " << NV("Cost", Cost)
              << " and with tree size "
              << NV("TreeSize", R.getTreeSize()));

      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
      });
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned P = First ? Val.first : Val.second;
        return V + (P - Mean) * (P - Mean);
      });
  return Dev * 81 / (Mean * Mean) == 0;
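// The final check above is an integer relative-standard-deviation test:
// Dev * 81 / (Mean * Mean) == 0 accepts the size distribution only when
// Dev < Mean^2 / 81, i.e. the standard deviation is below Mean/9 (roughly
// 11% of the mean). Worked example: sizes {4, 4, 4, 8} give Mean = 5 and
// Dev = (1 + 1 + 1 + 9) / 4 = 3, so 3 * 81 / 25 = 9 != 0 and the mix of
// tree sizes is rejected as too uneven. Same predicate, standalone:
#include <cstdint>

inline bool sizesAreUniform(std::uint64_t Dev, std::uint64_t Mean) {
  return Dev * 81 / (Mean * Mean) == 0;
}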
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  bool Changed = false;

  struct StoreDistCompare {
    bool operator()(const std::pair<unsigned, int> &Op1,
                    const std::pair<unsigned, int> &Op2) const {
      return Op1.second < Op2.second;
    }
  };
  using StoreIndexToDistSet =
      std::set<std::pair<unsigned, int>, StoreDistCompare>;
  auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
      if (Operands.empty() || Data.second - PrevDist == 1) {
        PrevDist = Data.second;
        if (Idx != Set.size() - 1)
      }
      Operands.push_back(Stores[DataVar.first]);
      PrevDist = DataVar.second;
          .insert({Operands.front(),
                   cast<StoreInst>(Operands.front())->getValueOperand(),
                   cast<StoreInst>(Operands.back())->getValueOperand(),

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      unsigned MaxRegVF = MaxVF;

      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      if (ValueTy == StoreTy &&
          R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
      unsigned MinVF = std::max<unsigned>(
              R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << "MinVF (" << MinVF << ")\n");
      }

      unsigned NonPowerOf2VF = 0;
        unsigned CandVF = Operands.size();
          NonPowerOf2VF = CandVF;

        unsigned Size = MinVF;
        VF = Size > MaxVF ? NonPowerOf2VF : Size;

      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
        P.first = P.second = 1;
      });
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned Size : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned StartIdx = std::distance(
              RangeSizes.begin(),
              find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
                                            std::placeholders::_1)));
          while (StartIdx < End) {
                std::distance(RangeSizes.begin(),
                              find_if(RangeSizes.drop_front(StartIdx),
                                      std::bind(IsVectorized, Size >= MaxRegVF,
                                                std::placeholders::_1)));
            unsigned Sz = EndIdx >= End ? End : EndIdx;
            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
                                   Size >= MaxRegVF)) {
                         return cast<StoreInst>(V)
                                    ->getValueOperand()
                                cast<StoreInst>(Slice.front())
                                    ->getValueOperand()
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
                  Cnt += NonSchedSizeMax;
                }
              }
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
                    .first->getSecond()
                AnyProfitableGraph = RepeatChanged = Changed = true;
                         [](std::pair<unsigned, unsigned> &P) {
                           P.first = P.second = 0;
                         });
                if (Cnt < StartIdx + MinVF) {
                  for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  StartIdx = Cnt + Size;
                }
                if (Cnt > Sz - Size - MinVF) {
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                }
              if (Size > 2 && Res &&
                          std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
                                    std::placeholders::_1))) {
              }
              if (Size > MaxRegVF && TreeSize > 1 &&
                          std::bind(FirstSizeSame, TreeSize,
                                    std::placeholders::_1))) {
                while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
              }
                       [&](std::pair<unsigned, unsigned> &P) {
                         if (Size >= MaxRegVF)
                           P.second = std::max(P.second, TreeSize);
                         else
                           P.first = std::max(P.first, TreeSize);
                       });
              AnyProfitableGraph = true;
            }
            if (StartIdx >= End)
            if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
              AnyProfitableGraph = true;
            StartIdx = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(Sz),
                        std::bind(IsNotVectorized, Size >= MaxRegVF,
                                  std::placeholders::_1)));
          }
          if (!AnyProfitableGraph && Size >= MaxRegVF)
        }
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
            static_cast<unsigned>(
            RangeSizes.begin(),
            find_if(RangeSizes, std::bind(IsNotVectorized, true,
                                          std::placeholders::_1))) +
        if (VF > MaxTotalNum || VF >= StoresLimit)
        for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
            P.first = std::max(P.second, P.first);
        });
        CandidateVFs.clear();
        CandidateVFs.push_back(VF);
  };

    for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
          Stores[Set.first]->getValueOperand()->getType(),
          Stores[Set.first]->getPointerOperand(),
          SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
        auto It = Set.second.find(std::make_pair(Idx, *Diff));
        if (It == Set.second.end()) {
          Set.second.emplace(Idx, *Diff);
        }
        TryToVectorize(Set.second);
        StoreIndexToDistSet PrevSet;
        PrevSet.swap(Set.second);
        Set.second.emplace(Idx, 0);
        unsigned StartIdx = It->first + 1;
        for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
          if (Pair.first <= It->first ||
              VectorizedStores.contains(Stores[Pair.first]))
          unsigned BI = Pair.first - StartIdx;
          UsedStores.set(BI);
          Dists[BI] = Pair.second - It->second;
        }
        for (unsigned I = StartIdx; I < Idx; ++I) {
          unsigned BI = I - StartIdx;
          if (UsedStores.test(BI))
            Set.second.emplace(I, Dists[BI]);
        }
    }
    auto &Res = SortedStores.emplace_back();
    Res.second.emplace(Idx, 0);

  Type *PrevValTy = nullptr;
    if (R.isDeleted(SI))
      PrevValTy = SI->getValueOperand()->getType();
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (auto &Set : SortedStores)
        TryToVectorize(Set.second);
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  for (auto &Set : SortedStores)
    TryToVectorize(Set.second);
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
      if (isa<Constant>(Idx))
      if (GEP->getType()->isVectorTy())
    }
GEP->getType()->isVectorTy())
LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                  << VL.size() << ".\n");
// ...
InstructionsState S = getSameOpcode(VL, *TLI);
if (!S.getOpcode())
  return false;
Instruction *I0 = cast<Instruction>(S.OpValue);
// ...
for (Value *V : VL) {
  Type *Ty = V->getType();
  if (!isValidElementType(Ty)) {
    R.getORE()->emit([&]() {
      std::string TypeStr;
      // ... (remark prefix elided)
          << "Cannot SLP vectorize list: type "
          << TypeStr + " is unsupported by vectorizer";
    });
    return false;
  }
}

unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = R.getMinVF(Sz);
unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
if (MaxVF < 2) {
  R.getORE()->emit([&]() {
    // ... (remark prefix elided)
        << "Cannot SLP vectorize list: vectorization factor "
        << "less than 2 is not supported";
  });
  return false;
}

bool Changed = false;
bool CandidateFound = false;
InstructionCost MinCost = SLPCostThreshold.getValue();
Type *ScalarTy = VL[0]->getType();
if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
  ScalarTy = IE->getOperand(1)->getType();

unsigned NextInst = 0, MaxInst = VL.size();
for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
  // ...
  for (unsigned I = NextInst; I < MaxInst; ++I) {
    unsigned ActualVF = std::min(MaxInst - I, VF);
    // ...
    if (MaxVFOnly && ActualVF < MaxVF)
      break;
    if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
      break;
    ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
    if (any_of(Ops, [&](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && R.isDeleted(I);
        }))
      continue;
    LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                      /* ... */);
    // ...
    if (R.isTreeTinyAndNotFullyVectorizable())
      continue;
    R.reorderTopToBottom();
    R.reorderBottomToTop(
        !isa<InsertElementInst>(Ops.front()) &&
        !R.doesRootHaveInTreeUses());
    R.transformNodes();
    R.buildExternalUses();

    R.computeMinimumValueSizes();
    // ...
    CandidateFound = true;
    MinCost = std::min(MinCost, Cost);
    LLVM_DEBUG(dbgs() << /* ... */ " for VF=" << ActualVF << "\n");
    // ...
    R.getORE()->emit(OptimizationRemark(/* ... */,
                                        cast<Instruction>(Ops[0]))
                     << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                     << " and with tree size "
                     << ore::NV("TreeSize", R.getTreeSize()));
    // ...
  }
}

if (!Changed && CandidateFound) {
  R.getORE()->emit([&]() {
    // ... (remark prefix elided)
        << "List vectorization was possible but not beneficial with cost "
        << ore::NV("Cost", MinCost) << " >= "
    // ...
  });
} else if (!Changed) {
  R.getORE()->emit([&]() {
    // ... (remark prefix elided)
        << "Cannot SLP vectorize list: vectorization was impossible"
        << " with available vectorization factors";
  });
}
return Changed;
if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
  return false;
// ...
auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
  return false;
// ...
auto *A = dyn_cast<BinaryOperator>(Op0);
auto *B = dyn_cast<BinaryOperator>(Op1);

if (A && B && B->hasOneUse()) {
  auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
  auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
  if (B0 && B0->getParent() == P)
    Candidates.emplace_back(A, B0);
  if (B1 && B1->getParent() == P)
    Candidates.emplace_back(A, B1);
}
if (B && A && A->hasOneUse()) {
  auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
  auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
  if (A0 && A0->getParent() == P)
    Candidates.emplace_back(B, A0);
  if (A1 && A1->getParent() == P)
    Candidates.emplace_back(B, A1);
}

if (Candidates.size() == 1)
  return tryToVectorizeList({Op0, Op1}, R);

// We have multiple options. Try to pick the single best.
std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
if (!BestCandidate)
  return false;
return tryToVectorizeList(
    {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
    R);
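// Pair selection above: for a root I = A op B, candidate pairs combine one
// operand with a same-block sub-operand of the other ((A, B0), (A, B1),
// (B, A0), (B, A1)). With more than one option, findBestRootPair chooses via
// the look-ahead heuristic (slp-max-root-look-ahead-depth) before a single
// tryToVectorizeList attempt is made.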
ReductionOpsListType ReductionOps;
// ...
bool IsSupportedHorRdxIdentityOp = false;
// ...
static bool isCmpSelMinMax(Instruction *I) {
  return isa<SelectInst>(I) &&
         /* ... matches a select fed by a cmp on the same values */;
}
// ...
static bool isVectorizable(RecurKind Kind, Instruction *I) {
  if (Kind == RecurKind::None)
    return false;
  // ...
  if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
    // FP min/max are associative except for NaN and -0.0.
    return I->getFastMathFlags().noNaNs();
  }
  if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
    return true;
  return I->isAssociative();
}

static Value *getRdxOperand(Instruction *I, unsigned Index) {
  // ...
  if (/* ... */)
    return I->getOperand(2);
  return I->getOperand(Index);
}
static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                       Value *RHS, const Twine &Name, bool UseSelect) {
  switch (Kind) {
  case RecurKind::Or:
    // ...
  case RecurKind::And:
    // ...
  case RecurKind::Add:
  case RecurKind::Mul:
  case RecurKind::Xor:
  case RecurKind::FAdd:
  case RecurKind::FMul:
    // ...
  case RecurKind::FMax:
    // ...
  case RecurKind::FMin:
    // ...
  case RecurKind::FMaximum:
    // ...
  case RecurKind::FMinimum:
    // ...
  case RecurKind::SMax:
    // ...
  case RecurKind::SMin:
    // ...
  case RecurKind::UMax:
    // ...
  case RecurKind::UMin:
    // ...
  default:
    llvm_unreachable("Unknown reduction operation.");
  }
}

static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                       Value *RHS, const Twine &Name,
                       const ReductionOpsListType &ReductionOps) {
  bool UseSelect = ReductionOps.size() == 2 ||
                   // Logical or/and.
                   (ReductionOps.size() == 1 &&
                    any_of(ReductionOps.front(), IsaPred<SelectInst>));
  assert((!UseSelect || ReductionOps.size() != 2 ||
          isa<SelectInst>(ReductionOps[1][0])) &&
         "Expected cmp + select pairs for reduction");
  Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
  // ...
  if (auto *Sel = dyn_cast<SelectInst>(Op)) {
    // ...
  }
  return Op;
}
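// createOp, in outline: each RecurKind lowers either to a plain binop or
// min/max intrinsic, or, when UseSelect is set (the reduction was matched in
// cmp + select form, per the assertion above), to an explicit cmp + select
// sequence so that poison-safe logical and/or shapes are preserved.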
static RecurKind getRdxKind(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return RecurKind::None;
  if (match(I, m_Add(m_Value(), m_Value())))
    return RecurKind::Add;
  if (match(I, m_Mul(m_Value(), m_Value())))
    return RecurKind::Mul;
  if (match(I, m_And(m_Value(), m_Value())) /* ... */)
    return RecurKind::And;
  if (match(I, m_Or(m_Value(), m_Value())) /* ... */)
    return RecurKind::Or;
  if (match(I, m_Xor(m_Value(), m_Value())))
    return RecurKind::Xor;
  if (match(I, m_FAdd(m_Value(), m_Value())))
    return RecurKind::FAdd;
  if (match(I, m_FMul(m_Value(), m_Value())))
    return RecurKind::FMul;
  if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
    return RecurKind::FMax;
  if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
    return RecurKind::FMin;
  if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
    return RecurKind::FMaximum;
  if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
    return RecurKind::FMinimum;
  if (match(I, m_SMax(m_Value(), m_Value())))
    return RecurKind::SMax;
  if (match(I, m_SMin(m_Value(), m_Value())))
    return RecurKind::SMin;
  if (match(I, m_UMax(m_Value(), m_Value())))
    return RecurKind::UMax;
  if (match(I, m_UMin(m_Value(), m_Value())))
    return RecurKind::UMin;

  if (auto *Select = dyn_cast<SelectInst>(I)) {
    // ...
    if (!isa<ExtractElementInst>(RHS) ||
        /* ... */)
      return RecurKind::None;
    // ...
    if (!isa<ExtractElementInst>(LHS) ||
        /* ... */)
      return RecurKind::None;
    // ...
    if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
      return RecurKind::None;
    // ...
      return RecurKind::None;
    // ...
      return RecurKind::None;
    if (/* Pred is sgt/sge */)
      return RecurKind::SMax;
    if (/* Pred is slt/sle */)
      return RecurKind::SMin;
    if (/* Pred is ugt/uge */)
      return RecurKind::UMax;
    if (/* Pred is ult/ule */)
      return RecurKind::UMin;
  }
  return RecurKind::None;
}
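// Kind detection above first tries plain two-operand patterns (add, mul,
// and/or, xor, fadd, fmul, the fp min/max intrinsics, and integer
// smax/smin/umax/umin), then falls back to select(cmp) forms; the
// extractelement guards reject selects whose true/false values do not trace
// back to the compared values, and the final predicate checks map signed and
// unsigned comparisons onto the matching S/U min/max recurrence kinds.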
static unsigned getFirstOperandIndex(Instruction *I) {
  return isCmpSelMinMax(I) ? 1 : 0;
}

static unsigned getNumberOfOperands(Instruction *I) {
  return isCmpSelMinMax(I) ? 3 : 2;
}

static bool hasSameParent(Instruction *I, BasicBlock *BB) {
  if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
    auto *Sel = cast<SelectInst>(I);
    auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
    return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
  }
  return I->getParent() == BB;
}

static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
  if (IsCmpSelMinMax) {
    // ...
    if (auto *Sel = dyn_cast<SelectInst>(I))
      return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
    return I->hasNUses(2);
  }
  // ...
  return I->hasOneUse();
}

void initReductionOps(Instruction *I) {
  if (isCmpSelMinMax(I))
    ReductionOps.assign(2, ReductionOpsType());
  else
    ReductionOps.assign(1, ReductionOpsType());
}

void addReductionOps(Instruction *I) {
  if (isCmpSelMinMax(I)) {
    ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
    ReductionOps[1].emplace_back(I);
  } else {
    ReductionOps[0].emplace_back(I);
  }
}

static bool isGoodForReduction(ArrayRef<Value *> Data) {
  int Sz = Data.size();
  auto *I = dyn_cast<Instruction>(Data.front());
  return Sz > 1 || isConstant(Data.front()) ||
         /* ... */;
}
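// Bookkeeping convention above: a cmp+select min/max reduction keeps two
// ReductionOps lists (list 0 holds the select conditions, list 1 the selects
// themselves) and its reduced operands start at index 1 of the select; an
// interior reduction op must have exactly the uses the reduction accounts
// for (select: two uses plus a single-use condition; plain binop: one use).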
bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                               ScalarEvolution &SE, const DataLayout &DL,
                               const TargetLibraryInfo &TLI) {
  RdxKind = HorizontalReduction::getRdxKind(Root);
  if (!isVectorizable(RdxKind, Root))
    return false;
  // ...
  if (auto *Sel = dyn_cast<SelectInst>(Root))
    if (!Sel->getCondition()->hasOneUse())
      return false;

  ReductionRoot = Root;
  // ...
  BasicBlock *BB = Root->getParent();
  bool IsCmpSelMinMax = isCmpSelMinMax(Root);
  SmallVector<std::pair<Instruction *, unsigned>> Worklist(
      1, std::make_pair(Root, 0));
  // ...
  auto CheckOperands = [&](Instruction *TreeN,
                           SmallVectorImpl<Value *> &PossibleReducedVals,
                           SmallVectorImpl<Instruction *> &ReductionOps,
                           unsigned Level) {
    for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                  getNumberOfOperands(TreeN)))) {
      Value *EdgeVal = getRdxOperand(TreeN, I);
      ReducedValsToOps[EdgeVal].push_back(TreeN);
      auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
      // If the edge is not an instruction, differs from the main reduction
      // opcode, or has the wrong number of uses - possible reduced value.
      if (!EdgeInst || /* ... */
          IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
          !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
          !isVectorizable(RdxKind, EdgeInst) ||
          (R.isAnalyzedReductionRoot(EdgeInst) &&
           all_of(EdgeInst->operands(), IsaPred<Constant>))) {
        PossibleReducedVals.push_back(EdgeVal);
        continue;
      }
      ReductionOps.push_back(EdgeInst);
    }
  };
  // ...
  MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
      PossibleReducedVals;
  initReductionOps(Root);
  // ...
  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    // ...
    auto LIt = LoadsMap.find(Ptr);
    if (LIt != LoadsMap.end()) {
      for (LoadInst *RLI : LIt->second) {
        // ...
      }
      for (LoadInst *RLI : LIt->second) {
        // ...
      }
      if (LIt->second.size() > 2) {
        // ...
        hash_value(LIt->second.back()->getPointerOperand());
        // ...
      }
    }
    LoadKeyUsed.insert(Key);
    // ...
  };

  while (!Worklist.empty()) {
    auto [TreeN, Level] = Worklist.pop_back_val();
    SmallVector<Value *> PossibleRedVals;
    SmallVector<Instruction *> PossibleReductionOps;
    CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
    addReductionOps(TreeN);
    // ...
    for (Value *V : PossibleRedVals) {
      // ... (Key/Idx come from generateKeySubkey with GenerateLoadsSubkey)
      ++PossibleReducedVals[Key][Idx]
            .insert(std::make_pair(V, 0))
            .first->second;
    }
    for (Instruction *I : PossibleReductionOps)
      Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
  }
  auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
  // ...
  for (auto &PossibleReducedVals : PossibleReducedValsVect) {
    auto PossibleRedVals = PossibleReducedVals.second.takeVector();
    // ...
    for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
         It != E; ++It) {
      // ...
      auto RedValsVect = It->second.takeVector();
      // ...
      for (const std::pair<Value *, unsigned> &Data : RedValsVect)
        PossibleRedValsVect.back().append(Data.second, Data.first);
    }
    stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
      return P1.size() > P2.size();
    });
    for (ArrayRef<Value *> Data : PossibleRedValsVect) {
      // ... (NewIdx: index of a compatible existing group, or negative)
      if (NewIdx < 0 ||
          (!isGoodForReduction(Data) &&
           (!isa<LoadInst>(Data.front()) ||
            !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
            getUnderlyingObject(
                cast<LoadInst>(Data.front())->getPointerOperand()) !=
                getUnderlyingObject(
                    cast<LoadInst>(ReducedVals[NewIdx].front())
                        ->getPointerOperand())))) {
        NewIdx = ReducedVals.size();
        ReducedVals.emplace_back();
      }
      ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
    }
  }
  // ...
  return true;
}
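// matchAssociativeReduction, in outline: a worklist walk from the root
// follows operands that still look like reduction ops of the same kind and
// records everything else as a possible reduced value. Reduced values are
// grouped by (key, subkey), with loads sub-keyed through GenerateLoadsSubkey
// so loads off the same base pointer can land in one group, and the groups
// are stably sorted by size so the longest candidate sequences are reduced
// first.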
Value *tryToReduce(BoUpSLP &V, const DataLayout &DL,
                   TargetTransformInfo *TTI, const TargetLibraryInfo &TLI) {
  constexpr int ReductionLimit = 4;
  constexpr unsigned RegMaxNumber = 4;
  constexpr unsigned RedValsMaxNumber = 128;
  // ...
  unsigned NumReducedVals =
      std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
                      [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
                        if (!isGoodForReduction(Vals))
                          return Num;
                        return Num + Vals.size();
                      });
  if (NumReducedVals < ReductionLimit &&
      /* ... */) {
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps)
        V.analyzedReductionRoot(cast<Instruction>(RdxOp));
    return nullptr;
  }
  // ...
  /* ... */ ReducedVals.front().size());

  // The insertion point is the cmp of the select for min/max reductions.
  auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
    assert(isa<SelectInst>(RdxRootInst) &&
           "Expected min/max reduction to have select root instruction");
    Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
    assert(isa<Instruction>(ScalarCond) &&
           "Expected min/max reduction to have compare condition");
    return cast<Instruction>(ScalarCond);
  };
  auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
    if (VectorizedTree) {
      // Update the final value in the reduction.
      Builder.SetCurrentDebugLocation(
          cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
      if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
          /* ... */) {
        auto It = ReducedValsToOps.find(Res);
        if (It != ReducedValsToOps.end() &&
            /* ... */)
          std::swap(VectorizedTree, Res);
      }
      return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                      ReductionOps);
    }
    // Initialize the final value in the reduction.
    return Res;
  };
  bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
    return isBoolLogicOp(cast<Instruction>(V));
  });
  SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                    ReductionOps.front().size());
  for (ReductionOpsType &RdxOps : ReductionOps)
    for (Value *RdxOp : RdxOps) {
      // ...
      IgnoreList.insert(RdxOp);
    }
  // Intersect the fast-math flags from all reduction operations.
  for (Value *U : IgnoreList)
    if (auto *FPMO = dyn_cast<FPMathOperator>(U))
      RdxFMF &= FPMO->getFastMathFlags();
  bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
  // ...
  for (Value *V : Candidates)
    TrackedVals.try_emplace(V, V);
  // ...
  Value *VectorizedTree = nullptr;
  bool CheckForReusedReductionOps = false;
  for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
    ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
    InstructionsState S = States[I];
    // ...
    for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
      Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
      // Exclude values overriden by extractelements during vectorization if
      // they are not compatible with the other values.
      auto *Inst = dyn_cast<Instruction>(RdxVal);
      if ((/* ... */
           (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
          (S.getOpcode() && !Inst))
        continue;
      Candidates.push_back(RdxVal);
      TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
    }
    bool ShuffledExtracts = false;
    // Try to handle shuffled extractelements.
    if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
        /* ... */) {
      // ...
      for (Value *RV : ReducedVals[I + 1]) {
        Value *RdxVal = TrackedVals.find(RV)->second;
        // ...
        auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
        if (!Inst)
          continue;
        CommonCandidates.push_back(RdxVal);
        TrackedToOrig.try_emplace(RdxVal, RV);
      }
      // ...
      if (isFixedVectorShuffle(CommonCandidates, Mask)) {
        Candidates.swap(CommonCandidates);
        ShuffledExtracts = true;
      }
    }

    // Emit code for constant values.
    if (/* ... */) {
      ++VectorizedVals.try_emplace(Candidates.front(), 0)
            .first->getSecond();
      // ...
      Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
      ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
      if (auto *ResI = dyn_cast<Instruction>(Res))
        V.analyzedReductionRoot(ResI);
      VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
      continue;
    }

    unsigned NumReducedVals = Candidates.size();
    if (NumReducedVals < ReductionLimit &&
        /* ... */)
      continue;

    // ...
    IsSupportedHorRdxIdentityOp = /* ... */
        RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
    // ...
    if (IsSupportedHorRdxIdentityOp)
      for (Value *V : Candidates)
        ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
    // ...
    // Check if we repeatedly reduce the same value; if so, a scalar scaling
    // optimization may replace full vectorization.
    bool SameScaleFactor = false;
    bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                            SameValuesCounter.size() != Candidates.size();
    if (OptReusedScalars) {
      SameScaleFactor =
          (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
           RdxKind == RecurKind::Xor) &&
          all_of(/* ... */,
                 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
                   return P.second == SameValuesCounter.front().second;
                 });
      Candidates.resize(SameValuesCounter.size());
      transform(SameValuesCounter, Candidates.begin(),
                [](const auto &P) { return P.first; });
      NumReducedVals = Candidates.size();
      // Have a reduction of the same element.
      if (NumReducedVals == 1) {
        Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
        unsigned Cnt = SameValuesCounter.lookup(OrigV);
        Value *RedVal =
            emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
        VectorizedVals.try_emplace(OrigV, Cnt);
        continue;
      }
    }
    unsigned MaxVecRegSize = V.getMaxVecRegSize();
    unsigned EltSize = V.getVectorElementSize(Candidates[0]);
    // ...
    unsigned ReduxWidth = std::min<unsigned>(
        llvm::bit_floor(NumReducedVals),
        std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
                             RegMaxNumber * RedValsMaxNumber));
    unsigned Start = 0;
    unsigned Pos = Start;
    // Restarts vectorization attempt with lower vector factor.
    unsigned PrevReduxWidth = ReduxWidth;
    bool CheckForReusedReductionOpsLocal = false;
    auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
                                &CheckForReusedReductionOpsLocal,
                                &PrevReduxWidth, &V,
                                &IgnoreList](bool IgnoreVL = false) {
      bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
      if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
        // Check if any of the reduction ops are gathered. If so, worth
        // trying again with less number of reduction ops.
        CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
      }
      ++Pos;
      if (Pos < NumReducedVals - ReduxWidth + 1)
        return IsAnyRedOpGathered;
      Pos = Start;
      ReduxWidth /= 2;
      return IsAnyRedOpGathered;
    };
    bool AnyVectorized = false;
    while (Pos < NumReducedVals - ReduxWidth + 1 &&
           ReduxWidth >= ReductionLimit) {
      // Dependency in tree of the reduction ops - drop this attempt, try
      // later.
      if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
          /* ... */) {
        CheckForReusedReductionOps = true;
        break;
      }
      PrevReduxWidth = ReduxWidth;
      // ... (VL: the slice of Candidates starting at Pos of size ReduxWidth)
      // Been analyzed already - skip.
      if (V.areAnalyzedReductionVals(VL)) {
        (void)AdjustReducedVals(/*IgnoreVL=*/true);
        continue;
      }
      // Early exit if any of the reduction values were deleted during
      // previous vectorization attempts.
      if (any_of(VL, [&V](Value *RedVal) {
            auto *RedValI = dyn_cast<Instruction>(RedVal);
            if (!RedValI)
              return false;
            return V.isDeleted(RedValI);
          }))
        break;
      V.buildTree(VL, IgnoreList);
      if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
        if (!AdjustReducedVals())
          V.analyzedReductionVals(VL);
        continue;
      }
      if (V.isLoadCombineReductionCandidate(RdxKind)) {
        if (!AdjustReducedVals())
          V.analyzedReductionVals(VL);
        continue;
      }
      V.reorderTopToBottom();
      // No need to reorder the root node at all.
      V.reorderBottomToTop(/*IgnoreReorder=*/true);
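      // Window scan above, in outline: VL is the [Pos, Pos + ReduxWidth)
      // slice of Candidates. AdjustReducedVals advances the window and,
      // once it no longer fits, resets Pos and halves ReduxWidth; it also
      // reports whether any reduction op ended up gathered, which triggers
      // a later retry with the reduced-value set rebuilt.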
      // ...
      LocalExternallyUsedValues[ReductionRoot];
      for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
        if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
          continue;
        for (Value *V : ReducedVals[Cnt])
          if (isa<Instruction>(V))
            LocalExternallyUsedValues[TrackedVals[V]];
      }
      if (!IsSupportedHorRdxIdentityOp) {
        // Number of uses of the candidates in the vector of values.
        assert(SameValuesCounter.empty() &&
               "Reused values counter map is not empty");
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *V = Candidates[Cnt];
          Value *OrigV = TrackedToOrig.find(V)->second;
          ++SameValuesCounter[OrigV];
        }
      }
      // ...
      for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
        if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
          continue;
        Value *RdxVal = Candidates[Cnt];
        if (!Visited.insert(RdxVal).second)
          continue;
        // Check if the scalar was vectorized as part of the vectorization
        // tree but not the top node.
        if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
          LocalExternallyUsedValues[RdxVal];
          continue;
        }
        Value *OrigV = TrackedToOrig.find(RdxVal)->second;
        unsigned NumOps =
            VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
        if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
          LocalExternallyUsedValues[RdxVal];
      }
      // Do not need the list of reused scalars in regular mode anymore.
      if (!IsSupportedHorRdxIdentityOp)
        SameValuesCounter.clear();
      for (Value *RdxVal : VL)
        if (RequiredExtract.contains(RdxVal))
          LocalExternallyUsedValues[RdxVal];
      V.transformNodes();
      V.buildExternalUses(LocalExternallyUsedValues);

      V.computeMinimumValueSizes();

      // Estimate cost.
      InstructionCost ReductionCost =
          getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
      // ...
      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for reduction\n");
      // ...
      if (Cost >= -SLPCostThreshold) {
        V.getORE()->emit([&]() {
          return OptimizationRemarkMissed(
                     SV_NAME, "HorSLPNotBeneficial",
                     ReducedValsToOps.find(VL[0])->second.front())
                 << "Vectorizing horizontal reduction is possible "
                 << "but not beneficial with cost " << ore::NV("Cost", Cost)
                 << " and threshold "
                 /* ... */;
        });
        if (!AdjustReducedVals())
          V.analyzedReductionVals(VL);
        continue;
      }
      LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                        << Cost << ". (HorRdx)\n");
      V.getORE()->emit([&]() {
        return OptimizationRemark(
                   SV_NAME, "VectorizedHorizontalReduction",
                   ReducedValsToOps.find(VL[0])->second.front())
               << "Vectorized horizontal reduction with cost "
               << ore::NV("Cost", Cost) << " and with tree size "
               << ore::NV("TreeSize", V.getTreeSize());
      });
      // ...
      // Emit a reduction. If the root is a select (min/max idiom), the
      // insert point is the compare condition of that select.
      Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
      Instruction *InsertPt = RdxRootInst;
      if (IsCmpSelMinMax)
        InsertPt = GetCmpForMinMaxReduction(RdxRootInst);

      // Vectorize a tree.
      Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
                                              ReplacedExternals, InsertPt);
      // ...
      // To prevent poison from leaking across what used to be sequential,
      // safe, scalar boolean logic operations, the reduction operand must be
      // frozen.
      if ((isBoolLogicOp(RdxRootInst) ||
           (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
          /* ... */)
        VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

      // Emit code to correctly handle reused reduced values, if required.
      if (OptReusedScalars && !SameScaleFactor) {
        VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                       SameValuesCounter, TrackedToOrig);
      }

      Value *ReducedSubTree;
      Type *ScalarTy = VL.front()->getType();
      if (isa<FixedVectorType>(ScalarTy)) {
        // ...
        for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
          // ... (Lane: the per-lane slice of VectorizedRoot)
          ReducedSubTree = Builder.CreateInsertElement(
              ReducedSubTree, emitReduction(Lane, Builder, ReduxWidth, TTI),
              I);
        }
      } else {
        ReducedSubTree =
            emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
      }
      if (ReducedSubTree->getType() != VL.front()->getType()) {
        assert(ReducedSubTree->getType() != VL.front()->getType() &&
               "Expected different reduction type.");
        ReducedSubTree =
            Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
                                  V.isSignedMinBitwidthRootNode());
      }

      // Improved analysis for add/fadd/xor reductions with same scale factor
      // for all operands of reductions.
      if (OptReusedScalars && SameScaleFactor)
        ReducedSubTree = emitScaleForReusedOps(
            ReducedSubTree, Builder, SameValuesCounter.front().second);

      VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
      // Count vectorized reduced values to exclude them from final
      // reduction.
      for (Value *RdxVal : VL) {
        Value *OrigV = TrackedToOrig.find(RdxVal)->second;
        if (IsSupportedHorRdxIdentityOp) {
          VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
          continue;
        }
        ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
        if (!V.isVectorized(RdxVal))
          RequiredExtract.insert(RdxVal);
      }
      // ...
      AnyVectorized = true;
    }
    if (OptReusedScalars && !AnyVectorized) {
      for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
        Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
        Value *OrigV = TrackedToOrig.find(P.first)->second;
        VectorizedVals.try_emplace(OrigV, P.second);
      }
      // ...
    }
  }
  if (VectorizedTree) {
    // ...
    auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
                                                 Instruction *RedOp1,
                                                 Instruction *RedOp2,
                                                 bool InitStep) {
      if (!AnyBoolLogicOp)
        return;
      if (isBoolLogicOp(RedOp1) &&
          ((!InitStep && LHS == VectorizedTree) ||
           /* ... */))
        return;
      if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                    getRdxOperand(RedOp2, 0) == RHS ||
                                    /* ... */)) {
        std::swap(LHS, RHS);
        return;
      }
      if (LHS != VectorizedTree)
        LHS = Builder.CreateFreeze(LHS);
    };
    // Finish the reduction: stitch the remaining scalars together pairwise
    // to limit dependencies between the remainders.
    auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
                        bool InitStep) {
      unsigned Sz = InstVals.size();
      SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
                                                               Sz % 2);
      for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
        // ...
        Value *RdxVal1 = InstVals[I].second;
        Value *StableRdxVal1 = RdxVal1;
        auto It1 = TrackedVals.find(RdxVal1);
        if (It1 != TrackedVals.end())
          StableRdxVal1 = It1->second;
        Value *RdxVal2 = InstVals[I + 1].second;
        Value *StableRdxVal2 = RdxVal2;
        auto It2 = TrackedVals.find(RdxVal2);
        if (It2 != TrackedVals.end())
          StableRdxVal2 = It2->second;
        // ...
        FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                          /* ... */);
        Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                   StableRdxVal2, "op.rdx", ReductionOps);
        ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
      }
      if (Sz % 2 == 1)
        ExtraReds[Sz / 2] = InstVals.back();
      return ExtraReds;
    };
    ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
                                 VectorizedTree);
    // ...
    for (Value *RdxVal : Candidates) {
      if (!Visited.insert(RdxVal).second)
        continue;
      unsigned NumOps = VectorizedVals.lookup(RdxVal);
      // ...
    }
    // ...
    bool InitStep = true;
    while (ExtraReductions.size() > 1) {
      SmallVector<std::pair<Instruction *, Value *>> NewReds =
          FinalGen(ExtraReductions, InitStep);
      ExtraReductions.swap(NewReds);
      InitStep = false;
    }
    VectorizedTree = ExtraReductions.front().second;

    ReductionRoot->replaceAllUsesWith(VectorizedTree);
    // ...
    IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
    // ...
    for (auto *U : Ignore->users()) {
      assert(IgnoreSet.count(U) &&
             "All users must be either in the reduction ops list.");
    }
    if (!Ignore->use_empty()) {
      // ... (P: placeholder value of Ignore's type)
      Ignore->replaceAllUsesWith(P);
    }
    V.removeInstructionsAndOperands(RdxOps);
  } else if (!CheckForReusedReductionOps) {
    // ...
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps)
        V.analyzedReductionRoot(cast<Instruction>(RdxOp));
  }
  return VectorizedTree;
}
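// Tail of tryToReduce, in outline: every reduced value not folded into a
// vector reduction is stitched back in pairwise rounds (FinalGen) with
// "op.rdx" ops, freezing boolean-logic operands where poison could now leak;
// the final value replaces all uses of the original reduction root, and the
// consumed reduction ops are deleted after checking they have no uses left
// outside the reduction itself.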
InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                 ArrayRef<Value *> ReducedVals,
                                 bool IsCmpSelMinMax, unsigned ReduxWidth,
                                 FastMathFlags FMF) {
  // ...
  Type *ScalarTy = ReducedVals.front()->getType();
  // ...
  auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
    InstructionCost Cost = 0;
    int Cnt = ReducedVals.size();
    for (Value *RdxVal : ReducedVals) {
      // ...
      if (/* ... */) {
        Cost += GenCostFn();
        continue;
      }
      InstructionCost ScalarCost = 0;
      for (User *U : RdxVal->users()) {
        auto *RdxOp = cast<Instruction>(U);
        if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
          // ...
          continue;
        }
        ScalarCost = InstructionCost::getInvalid();
        break;
      }
      if (ScalarCost.isValid())
        Cost += ScalarCost;
      else
        Cost += GenCostFn();
    }
    return Cost;
  };
  switch (RdxKind) {
  case RecurKind::Add:
  case RecurKind::Mul:
  case RecurKind::Or:
  case RecurKind::And:
  case RecurKind::Xor:
  case RecurKind::FAdd:
  case RecurKind::FMul: {
    // ...
    if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
      // ...
      for (unsigned I : seq<unsigned>(ReducedVals.size())) {
        // ...
      }
    }
    // ...
    ScalarCost = EvaluateScalarCost([&]() {
      // ... (per-element cost of the scalar reduction binop)
    });
    break;
  }
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin: {
    // ...
    ScalarCost = EvaluateScalarCost([&]() {
      // ... (per-element cost of the scalar min/max operation)
    });
    break;
  }
  // ...
  }

  LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                    << /* ... */ " (It is a splitting reduction)\n");
  return VectorCost - ScalarCost;
}
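// Reduction cost = VectorCost - ScalarCost. EvaluateScalarCost sums, per
// reduced value, either the real cost of its reduction-op users (when the
// use counts match the matched shape) or a generic per-op cost from the
// supplied callback; arithmetic kinds price a binop-based reduction, min/max
// kinds a min/max-based one.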
Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                     unsigned ReduxWidth, const TargetTransformInfo *TTI) {
  assert(VectorizedValue && "Need to have a vectorized tree node");
  assert(isPowerOf2_32(ReduxWidth) &&
         "We only handle power-of-two reductions for now");
  assert(RdxKind != RecurKind::FMulAdd &&
         "A call to the llvm.fmuladd intrinsic is not handled yet");

  ++NumVectorInstructions;
  // ... (emits the target reduction for RdxKind)
}

Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                             unsigned Cnt) {
  assert(IsSupportedHorRdxIdentityOp &&
         "The optimization of matched scalar identity horizontal reductions "
         "must be supported.");
  if (Cnt == 1)
    return VectorizedValue;
  switch (RdxKind) {
  case RecurKind::Add: {
    // res = mul vv, n
    Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
    LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                      << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateMul(VectorizedValue, Scale);
  }
  case RecurKind::Xor: {
    // res = n % 2 ? 0 : vv
    LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                      << ". (HorRdx)\n");
    // ...
    return VectorizedValue;
  }
  case RecurKind::FAdd: {
    // res = fmul v, n
    Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
    LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                      << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateFMul(VectorizedValue, Scale);
  }
  case RecurKind::And:
  case RecurKind::Or:
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin:
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
    // res = vv
    return VectorizedValue;
  case RecurKind::Mul:
  case RecurKind::FMul:
  case RecurKind::FMulAdd:
  case RecurKind::IAnyOf:
  case RecurKind::FAnyOf:
  case RecurKind::None:
    llvm_unreachable("Unexpected reduction kind for repeated scalar.");
  }
  return nullptr;
}
Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                     BoUpSLP &R,
                     const SmallMapVector<Value *, unsigned, 16>
                         &SameValuesCounter,
                     const DenseMap<Value *, Value *> &TrackedToOrig) {
  assert(IsSupportedHorRdxIdentityOp &&
         "The optimization of matched scalar identity horizontal reductions "
         "must be supported.");
  // ...
  auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
  if (VTy->getElementType() != VL.front()->getType()) {
    VectorizedValue = Builder.CreateIntCast(
        VectorizedValue,
        getWidenedType(VL.front()->getType(), VTy->getNumElements()),
        R.isSignedMinBitwidthRootNode());
  }
  switch (RdxKind) {
  case RecurKind::Add: {
    // root = mul prev_root, <1, 1, n, 1>
    SmallVector<Constant *> Vals;
    for (Value *V : VL) {
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
      Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
    }
    Value *Scale = ConstantVector::get(Vals);
    LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
                      << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateMul(VectorizedValue, Scale);
  }
  case RecurKind::And:
  case RecurKind::Or:
    // No need for multiple or/and(s).
    LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                      << ". (HorRdx)\n");
    return VectorizedValue;
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin:
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
    // No need for repeated min/max(s) of the same value.
    LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
                      << ". (HorRdx)\n");
    return VectorizedValue;
  case RecurKind::Xor: {
    // Replace values with even number of repeats with 0, since
    // x xor x = 0.
    SmallVector<int> Mask(
        cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
        PoisonMaskElem);
    std::iota(Mask.begin(), Mask.end(), 0);
    bool NeedShuffle = false;
    for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
      Value *V = VL[I];
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
      if (Cnt % 2 == 0) {
        Mask[I] = VF;
        NeedShuffle = true;
      }
    }
    LLVM_DEBUG(dbgs() << "SLP: Xor <"; /* ... */
               dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
    if (NeedShuffle)
      VectorizedValue = Builder.CreateShuffleVector(
          VectorizedValue,
          ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
    return VectorizedValue;
  }
  case RecurKind::FAdd: {
    // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
    SmallVector<Constant *> Vals;
    for (Value *V : VL) {
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
      Vals.push_back(ConstantFP::get(V->getType(), Cnt));
    }
    Value *Scale = ConstantVector::get(Vals);
    return Builder.CreateFMul(VectorizedValue, Scale);
  }
  case RecurKind::Mul:
  case RecurKind::FMul:
  case RecurKind::FMulAdd:
  case RecurKind::IAnyOf:
  case RecurKind::FAnyOf:
  case RecurKind::None:
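    llvm_unreachable("Unexpected reduction kind for reused scalars.");
  }
  return nullptr;
}
// Reused-scalar shortcuts above: for Add/FAdd a per-lane (f)mul by the
// repeat counts replaces the repeated additions; for Xor, lanes repeated an
// even number of times are shuffled to zero (x xor x == 0); and/or and all
// min/max kinds are idempotent, so the vector is returned unchanged.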
/// Gets recurrence kind from the specified value.
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}

static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();

  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0))
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}
static void findBuildAggregate_rec(Instruction *LastInsertInst,
                                   TargetTransformInfo *TTI,
                                   SmallVectorImpl<Value *> &BuildVectorOpds,
                                   SmallVectorImpl<Value *> &InsertElts,
                                   unsigned OperandOffset) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(1);
    std::optional<unsigned> OperandIndex =
        getElementIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex)
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
                             BuildVectorOpds, InsertElts, *OperandIndex);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}

static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");
  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);
  // ...
  if (BuildVectorOpds.size() >= 2)
    return true;
  return false;
}
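// findBuildAggregate walks a single-use chain of insertelement/insertvalue
// instructions from the last insert, recording each inserted scalar and its
// insert position; getAggregateSize pre-computes the flat element count
// (requiring homogeneous struct fields) so the result vectors can be sized
// up front. At least two collected operands are needed for a buildvector.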
static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {
  // ...
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  // ... (BBLatch: the latch block of ParentBB's loop, if any)
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}
/// We could have an initial reduction that is not an add.
///  r *= v1 + v2 + v3 + v4
/// In such a case start looking for a tree rooted in the first '+'.
static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
                                                 Instruction *Root) {
  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
          isa<IntrinsicInst>(Root)) &&
         "Expected binop, select, or intrinsic for reduction matching");
  Value *LHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
  Value *RHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  if (LHS == Phi)
    return dyn_cast<Instruction>(RHS);
  if (RHS == Phi)
    return dyn_cast<Instruction>(LHS);
  return nullptr;
}

/// Returns the first operand of \p I that does not match \p Phi.
static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
  if (!matchRdxBop(I, Op0, Op1))
    return nullptr;
  return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
}

/// \Returns true if \p I is a candidate instruction for reduction
/// vectorization.
static bool isReductionCandidate(Instruction *I) {
  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
  Value *B0 = nullptr, *B1 = nullptr;
  bool IsBinop = matchRdxBop(I, B0, B1);
  return IsBinop || IsSelect;
}
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    TargetTransformInfo *TTI,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  // ...
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);

  if (Root->getParent() != BB || isa<PHINode>(Root))
    return false;
  // ...
  auto SelectRoot = [&]() {
    // ... (may swap in a secondary reduction root found through the phi)
    return Root;
  };
  // ...
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  // ...
  auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    // ...
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      // ...
    }
    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
    // analysis is done separately.
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
      PostponedInsts.push_back(FutureSeed);
    return true;
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // ...
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      // ...
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        // Try to find another reduction.
        Stack.emplace(I, Level);
        continue;
      }
      if (R.isDeleted(Inst))
        continue;
    } else {
      // We could not vectorize `Inst`, so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        // ...
      }
    }

    // Try to vectorize operands. Continue analysis only for instructions
    // from the same basic block, to save compile time.
    if (++Level < RecursionMaxDepth)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Op).second)
          if (auto *I = dyn_cast<Instruction>(Op))
            // CmpInst and insert operands are analyzed separately.
            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(
                    I) &&
                !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(I, Level);
  }
  return Res;
}
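// vectorizeHorReduction drives reduction discovery: a queue of
// (instruction, level) pairs seeded from the root (or a secondary root found
// through the phi) is processed breadth-first; each node is first tried as
// an associative reduction, otherwise recorded as a postponed seed, and its
// same-block operands are enqueued until slp-recursion-max-depth is reached.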
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 TargetTransformInfo *TTI) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  if (!R.canMapToVector(IVI->getType()))
    return false;
  // ...
  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      // ... (remark prefix elided)
          << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
             "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  // ...
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
      (llvm::all_of(BuildVectorOpds,
                    IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      // ... (remark prefix elided)
          << "Cannot SLP vectorize list: only 2 elements of buildvector, "
             "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(T *, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(*SameTypeIt, *IncIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // ...
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success: start over because instructions might have been changed.
      Changed = true;
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL)
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
    } else {
      /// \Returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL)
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E ||
         (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success: start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        for (auto *It = Candidates.begin(), *End = Candidates.end();
             It != End; VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 &&
              TryToVectorizeHelper(ArrayRef(VL), /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }
    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
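// tryToVectorizeSequence, per the signature above: Comparator pre-sorts the
// worklist so compatible instructions become adjacent, AreCompatible
// delimits each run, and TryToVectorizeHelper is invoked per run, first in
// MaxVFOnly mode; runs that fail are pooled in Candidates and retried with
// smaller factors once a type boundary is reached.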
/// Compare two cmp instructions. If IsCompatibility is true, returns true if
/// the two cmps are compatible (have equal predicate and operand types);
/// otherwise implements a strict weak ordering between cmp instructions.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  // ... (BasePred1/BasePred2: predicates normalized under operand swap)
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    // ...
    if (/* ... */)
      return !IsCompatibility;
    // ...
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with same parent.
          // ... (NodeI1/NodeI2: dominator tree nodes of the parents)
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
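// compareCmp is used in both instantiations below: compareCmp<false> as the
// strict-weak-order comparator for sorting cmp seeds, and compareCmp<true>
// as the compatibility test that delimits runs handed to tryToVectorizeList.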
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op))
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize the list of compares.
  auto CompareSorter = [&](Value *V, Value *V2) {
    // ...
    return compareCmp<false>(V, V2, *TLI, *DT);
  };

  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    return compareCmp<true>(V1, V2, *TLI, *DT);
  };

  // ...
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock =
            any_of(Candidates, [](Value *V) {
              return any_of(V->users(), [V](User *U) {
                auto *Select = dyn_cast<SelectInst>(U);
                return Select &&
                       Select->getParent() !=
                           cast<Instruction>(V)->getParent();
              });
            });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // First pass: match and vectorize a buildvector sequence for the maximum
    // VF only.
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R,
                                             /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/true);
    }
    // Second pass: try to vectorize reductions.
    if (R.isDeleted(I))
      continue;
    OpsChanged |=
        vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    // Third pass: match and vectorize a buildvector sequence with any VF.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R,
                                             /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // ...
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    // ...
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          // ... (NodeI1/NodeI2: dominator tree nodes of the parents)
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S.getOpcode() && !S.isAltShuffle())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        // ...
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
        // ...
      }
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      {
        // Non-constant non-instructions come next.
        if (!U1 && !U2) {
          auto ValID1 = Opcodes1[I]->getValueID();
          auto ValID2 = Opcodes2[I]->getValueID();
          if (ValID1 == ValID2)
            continue;
          if (ValID1 < ValID2)
            return true;
          if (ValID1 > ValID2)
            return false;
        }
        // ...
      }
      // Undefs come last.
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
    // ...
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          // ...
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P)
        break;
      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    // Find the corresponding non-phi nodes for better matching when trying
    // to build the tree.
    for (Value *V : Incoming) {
      // ...
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      // ...
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        // ...
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    // ...
  } while (HaveVectorizedPhiNodes);
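  // PHI pre-pass above, in outline: each phi is flattened (through nested
  // phis) to the list of its non-phi incoming values, and phis are then
  // sorted and grouped by type, width and those incoming-value shapes before
  // being fed to tryToVectorizeSequence; the do/while repeats as long as any
  // phis were vectorized.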
  VisitedInstrs.clear();

  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in PostProcessInserts and, if VectorizeCmps is true,
  // also vectorizes PostProcessCmps.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if I is in PostProcessInserts or PostProcessCmps.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if the instruction has no users and was probably used only
  // for side effects.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable vector type.
    if (isa<ScalableVectorType>(It->getType()))
      continue;

    // Skip instructions marked for deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // Restart: instructions were deleted and the iterator may have been
        // invalidated.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    if (isa<DbgInfoIntrinsic>(It))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        // ... (Root: the reduction instruction feeding the phi)
        if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Also try the incoming values, to catch reductions feeding the PHI
      // from other blocks.
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // Skip if the incoming block is the current BB for now; also bypass
        // unreachable IR for efficiency and to avoid crashing.
        if (BB == P->getIncomingBlock(I) ||
            /* ... */)
          continue;

        // Postponed instructions should not be vectorized here; delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res = vectorizeRootInstruction(nullptr, PI,
                                              P->getIncomingBlock(I), R,
                                              TTI);
          Changed |= Res;
          if (Res && R.isDeleted(P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      // ...
      // Try to vectorize the chain in the store root if this is the only
      // store to the address in the block.
      TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                            SI->getValueOperand()->hasOneUse();

      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here; delay
          // their vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
        }
      }
      // ...
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      // ... (on changes, restart from the beginning of the block)
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's
    // nothing to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. If a vector register can't hold 1 element, we are done.
    // ...
    auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs, preserving program
      // order.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some candidates may already be vectorized (marked deleted) or have
      // an index optimized to a constant; remove them.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove all pairs of getelementptrs with constant differences, since
      // one can be computed from the other.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1;
           ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices.
      // ...
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
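// GEP-index vectorization above: within each bucket of collected GEPs,
// candidate pairs whose addresses differ by a SCEV constant (or that share
// the same index value) are pruned, since one address is cheaply derived
// from the other; the surviving non-constant index computations are bundled
// and handed to tryToVectorizeList.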
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and value operands. Value operands must be
  // compatible (same opcode, same parent), otherwise it is definitely not
  // profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // UndefValues are compatible with all other values.
    if (isa<UndefValue>(V->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        // ...
        return I1->getOpcode() < I2->getOpcode();
      }
    if (isa<Constant>(V->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return false;
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    // ...
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        return S.getOpcode() > 0;
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  // ...
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");
    // ...
    // Reverse the stores for bottom-to-top analysis; for repeated stores to
    // the same address, this follows the store order (reversed to meet the
    // memory dependencies).
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
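// vectorizeStoreChains, in outline: per collected store bucket, the stores
// are reversed for bottom-up analysis, then sorted and grouped by value and
// pointer type, scalar width and value-operand shape; each compatible run is
// handed to vectorizeStores, with the Attempted set deduplicating repeated
// analyses of the same store slices.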
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and vector costs of vectorizing a set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (size), which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the argument type vector for the given call instruction with the given intrinsic ID for the specified vectorization factor VF.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instructions.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope exit.
This file defines generic set operations that may be used on sets of different types.
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metrics from passes.
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for its shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for its shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the order in which to shuffle it.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void setBit(unsigned BitPosition)
Set the bit at the position given by "bitPosition" to 1.
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
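A minimal usage sketch of the APInt entries above; the helper name apintDemo is hypothetical:

#include "llvm/ADT/APInt.h"
using namespace llvm;

void apintDemo() {
  APInt Mask = APInt::getZero(32);      // 32-bit value, all bits clear
  Mask.setBits(4, 8);                   // set bits 4..7 (hiBit is exclusive)
  Mask.setBit(0);                       // set a single bit
  APInt Ones = APInt::getAllOnes(32);   // all 32 bits set
  APInt Rem = Ones.urem(APInt(32, 10)); // unsigned remainder
  (void)Rem;
}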
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
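A minimal usage sketch of the ArrayRef entries above; the helper name arrayRefDemo is hypothetical:

#include "llvm/ADT/ArrayRef.h"
#include <cassert>
using namespace llvm;

void arrayRefDemo() {
  int Data[] = {1, 2, 3, 4, 5};
  ArrayRef<int> Ref(Data);                // non-owning view over Data
  assert(Ref.front() == 1 && Ref.back() == 5);
  ArrayRef<int> Mid = Ref.slice(1, 3);    // {2, 3, 4}
  ArrayRef<int> Tail = Ref.drop_front(2); // {3, 4, 5}
  assert(Mid.size() == 3 && Tail.equals({3, 4, 5}));
}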
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR changes between queries.
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst); holds everything related to calling a function.
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on each one.
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate for its type.
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string, and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a multiple of 8.
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default-constructed value if no such entry exists.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
value_type & FindAndConstruct(const KeyT &Key)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
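A minimal sketch of constructing a fixed vector IR type with the factory above; the helper name makeVec4I32 is hypothetical:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

FixedVectorType *makeVec4I32(LLVMContext &Ctx) {
  // Builds the IR type <4 x i32>.
  return FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
}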
ArrayRef< Type * > params() const
Type * getReturnType() const
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
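A minimal sketch using the IRBuilder shuffle API listed above; the helper name interleaveLo is hypothetical:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Interleaves the low halves of two <4 x i32> values V1 and V2,
// producing lanes {V1[0], V2[0], V1[1], V2[1]}.
Value *interleaveLo(IRBuilderBase &B, Value *V1, Value *V2) {
  return B.CreateShuffleVector(V1, V2, ArrayRef<int>{0, 4, 1, 5},
                               "interleave.lo");
}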
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field or array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos lives in, right after MovePos.
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instruction comes before Other.
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this', or nullptr if no such instruction exists.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
size_type count(const KeyT &Key) const
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
A discriminated union of two or more pointer types, with the discriminator in the low bit of the pointer.
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T get() const
Returns the value of the specified pointer type.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience functions.
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its cached SCEV information.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
void clear()
Completely clear the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exactly one source vector.
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e. each index between [0..VF) is used exactly once in each submask of size VF.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
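A minimal sketch exercising the static mask predicates above; the helper name maskDemo is hypothetical:

#include "llvm/IR/Instructions.h"
using namespace llvm;

void maskDemo() {
  // Identity: lanes 0..3 taken in order from one 4-element source.
  bool Id = ShuffleVectorInst::isIdentityMask({0, 1, 2, 3}, /*NumSrcElts=*/4);
  // Reverse of a single 4-element source.
  bool Rev = ShuffleVectorInst::isReverseMask({3, 2, 1, 0}, 4);
  // Contiguous subvector extract; Index receives the starting lane (2).
  int Index = 0;
  bool Ext = ShuffleVectorInst::isExtractSubvectorMask({2, 3}, 4, Index);
  (void)Id; (void)Rev; (void)Ext;
}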
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
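A minimal usage sketch of the SmallBitVector entries above; the helper name bitVectorDemo is hypothetical:

#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;

void bitVectorDemo() {
  SmallBitVector BV(8);           // 8 bits, initially clear
  BV.set(2);
  BV.set(5);
  int First = BV.find_first();    // 2
  int Next = BV.find_next(First); // 5
  (void)Next;
  bool AnySet = BV.any();         // true
  unsigned NumSet = BV.count();   // 2
  (void)AnySet; (void)NumSet;
}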
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instances.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+, etc.).
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given a vector type, change the element type while keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Value * getOperand(unsigned i) const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
void takeName(Value *V)
Transfer the name from V to this value.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order, starting from the root node down to the leaf nodes.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
unsigned getTreeSize() const
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with a narrower bitwidth at codegen, and returns its signedness if so.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars that are used outside the tree and must be extracted.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of the DeadVals instructions.
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and extraction.
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry plus a (possible) permutation.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
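A minimal sketch of the PatternMatch combinators above; the helper name matchAddTimesConst is hypothetical:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Returns true and binds X and Y if V has the form (X + Y) * C
// for some constant integer C.
bool matchAddTimesConst(Value *V, Value *&X, Value *&Y) {
  const APInt *C;
  return match(V, m_Mul(m_Add(m_Value(X), m_Value(Y)), m_APInt(C)));
}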
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and the distance can be computed.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
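A minimal sketch of make_scope_exit; the function processWithCleanup is hypothetical:

#include "llvm/ADT/ScopeExit.h"
#include <cstdio>

int processWithCleanup() {
  auto Cleanup = llvm::make_scope_exit([] { std::puts("cleanup runs"); });
  // ... work that may return early on any path ...
  return 0; // the callable runs here, when Cleanup leaves scope
}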
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>'s and is actually faster.
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx.
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing the effect of MI into a DIExpression.
testing::Matcher< const detail::ErrorHolder & > Failed()
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts, or llvm.threadlocal.address intrinsics from the specified value, returning the original object being addressed.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
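A minimal sketch of createStrideMask; the helper name strideMaskDemo is hypothetical:

#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

void strideMaskDemo() {
  // Picks every 2nd lane starting at lane 0: yields {0, 2, 4, 6},
  // e.g. to de-interleave one member of a stride-2 interleaved group.
  SmallVector<int, 16> Mask =
      createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  (void)Mask;
}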
auto reverse(ContainerTy &&C)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
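A minimal sketch of the power-of-two helpers indexed nearby (isPowerOf2_32, Log2_32, PowerOf2Ceil, bit_floor); the helper name powerOfTwoDemo is hypothetical:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
using namespace llvm;

void powerOfTwoDemo() {
  bool P = isPowerOf2_32(64);     // true
  unsigned L = Log2_32(64);       // 6
  uint64_t C = PowerOf2Ceil(100); // 128
  unsigned F = bit_floor(100u);   // 64 (largest power of two <= 100)
  (void)P; (void)L; (void)C; (void)F;
}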
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices, if reordering is required.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that will be converted into the vector instruction I.
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly.
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
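A minimal sketch of hash_combine; the helper name keyFor is hypothetical:

#include "llvm/ADT/Hashing.h"
#include <string>
using namespace llvm;

// Combines heterogeneous values into a single order-sensitive hash_code.
hash_code keyFor(unsigned Opcode, const std::string &Name) {
  return hash_combine(Opcode, Name);
}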
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than the source type.
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming value for a lane-mask phi as a machine instruction; the incoming register Reg and incoming block Block are specified.
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::pair and std::tuple) compares less than the first component of another container.
Function object to check whether the second component of a container supported by std::get (like std::pair and std::tuple) compares less than the second component of another container.
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.