73#ifdef EXPENSIVE_CHECKS
105using namespace slpvectorizer;
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
110STATISTIC(NumVectorInstructions,
"Number of vector instructions generated");
114 cl::desc(
"Run the SLP vectorization passes"));
118 cl::desc(
"Only vectorize if you gain more than this "
123 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
124 "heuristics and makes vectorization decision via cost modeling."));
128 cl::desc(
"Attempt to vectorize horizontal reductions"));
133 "Attempt to vectorize horizontal reductions feeding into a store"));
139 cl::desc(
"Allow optimization of original scalar identity operations on "
140 "matched horizontal reductions."));
144 cl::desc(
"Attempt to vectorize for this register size in bits"));
148 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
156 cl::desc(
"Limit the size of the SLP scheduling region per block"));
160 cl::desc(
"Attempt to vectorize for this register size in bits"));
164 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
168 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
174 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
183 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
187 cl::desc(
"The minimum number of loads, which should be considered strided, "
188 "if the stride is > 1 or is runtime value"));
192 cl::desc(
"The maximum stride, considered to be profitable."));
196 cl::desc(
"Display the SLP trees with Graphviz"));
200 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
230 return VectorType::isValidElementType(Ty) && !Ty->
isX86_FP80Ty() &&
237 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
244 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
245 !isa<ExtractValueInst, UndefValue>(V))
247 auto *
I = dyn_cast<Instruction>(V);
248 if (!
I || isa<ExtractValueInst>(
I))
250 if (!isa<FixedVectorType>(
I->getOperand(0)->getType()))
252 if (isa<ExtractElementInst>(
I))
254 assert(isa<InsertElementInst>(V) &&
"Expected only insertelement.");
270 return std::min<unsigned>(PartNumElems,
Size - Part * PartNumElems);
278 OS <<
"n=" << VL.
size() <<
" [" << *VL.
front() <<
", ..]";
294 for (
int I = 1, E = VL.
size();
I < E;
I++) {
295 auto *
II = dyn_cast<Instruction>(VL[
I]);
299 if (BB !=
II->getParent())
316 Value *FirstNonUndef =
nullptr;
317 for (
Value *V : VL) {
318 if (isa<UndefValue>(V))
320 if (!FirstNonUndef) {
324 if (V != FirstNonUndef)
327 return FirstNonUndef !=
nullptr;
332 if (
auto *Cmp = dyn_cast<CmpInst>(
I))
333 return Cmp->isCommutative();
334 if (
auto *BO = dyn_cast<BinaryOperator>(
I))
335 return BO->isCommutative() ||
336 (BO->getOpcode() == Instruction::Sub &&
342 ICmpInst::Predicate Pred;
343 if (match(U.getUser(),
344 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
345 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
349 return match(U.getUser(),
350 m_Intrinsic<Intrinsic::abs>(
351 m_Specific(U.get()), m_ConstantInt(Flag))) &&
352 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
355 (BO->getOpcode() == Instruction::FSub &&
358 return match(U.getUser(),
359 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
361 return I->isCommutative();
369 if (
const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
370 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
373 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
376 if (CI->getValue().uge(VT->getNumElements()))
378 Index *= VT->getNumElements();
379 Index += CI->getZExtValue();
383 const auto *
IV = cast<InsertValueInst>(InsertInst);
384 Type *CurrentType =
IV->getType();
385 for (
unsigned I :
IV->indices()) {
386 if (
const auto *ST = dyn_cast<StructType>(CurrentType)) {
387 Index *= ST->getNumElements();
388 CurrentType = ST->getElementType(
I);
389 }
else if (
const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
390 Index *= AT->getNumElements();
391 CurrentType = AT->getElementType();
424 if (MaskArg == UseMask::UndefsAsMask)
428 if (MaskArg == UseMask::FirstArg &&
Value < VF)
429 UseMask.reset(
Value);
430 else if (MaskArg == UseMask::SecondArg &&
Value >= VF)
431 UseMask.reset(
Value - VF);
439template <
bool IsPoisonOnly = false>
443 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
446 auto *VecTy = dyn_cast<FixedVectorType>(
V->getType());
449 auto *
C = dyn_cast<Constant>(V);
451 if (!UseMask.empty()) {
453 while (
auto *
II = dyn_cast<InsertElementInst>(
Base)) {
455 if (isa<T>(
II->getOperand(1)))
462 if (*
Idx < UseMask.size() && !UseMask.test(*
Idx))
470 Res &= isUndefVector<IsPoisonOnly>(
Base, SubMask);
477 for (
unsigned I = 0, E = VecTy->getNumElements();
I != E; ++
I) {
478 if (
Constant *Elem =
C->getAggregateElement(
I))
480 (UseMask.empty() || (
I < UseMask.size() && !UseMask.test(
I))))
508static std::optional<TargetTransformInfo::ShuffleKind>
510 const auto *It =
find_if(VL, IsaPred<ExtractElementInst>);
513 auto *EI0 = cast<ExtractElementInst>(*It);
514 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
517 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
518 Value *Vec1 =
nullptr;
519 Value *Vec2 =
nullptr;
521 auto *EE = dyn_cast<ExtractElementInst>(V);
524 Value *Vec = EE->getVectorOperand();
525 if (isa<UndefValue>(Vec))
530 ShuffleMode CommonShuffleMode =
Unknown;
532 for (
unsigned I = 0, E = VL.
size();
I < E; ++
I) {
534 if (isa<UndefValue>(VL[
I]))
536 auto *EI = cast<ExtractElementInst>(VL[
I]);
537 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
539 auto *Vec = EI->getVectorOperand();
541 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
544 if (isa<UndefValue>(Vec)) {
547 if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
Size)
549 if (isa<UndefValue>(EI->getIndexOperand()))
551 auto *
Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
557 unsigned IntIdx =
Idx->getValue().getZExtValue();
564 if (!Vec1 || Vec1 == Vec) {
566 }
else if (!Vec2 || Vec2 == Vec) {
572 if (CommonShuffleMode == Permute)
576 if (Mask[
I] %
Size !=
I) {
577 CommonShuffleMode = Permute;
580 CommonShuffleMode =
Select;
583 if (CommonShuffleMode ==
Select && Vec2)
594 assert((Opcode == Instruction::ExtractElement ||
595 Opcode == Instruction::ExtractValue) &&
596 "Expected extractelement or extractvalue instruction.");
597 if (Opcode == Instruction::ExtractElement) {
598 auto *CI = dyn_cast<ConstantInt>(E->
getOperand(1));
601 return CI->getZExtValue();
603 auto *EI = cast<ExtractValueInst>(E);
604 if (EI->getNumIndices() != 1)
606 return *EI->idx_begin();
612struct InstructionsState {
614 Value *OpValue =
nullptr;
625 unsigned getAltOpcode()
const {
630 bool isAltShuffle()
const {
return AltOp != MainOp; }
633 unsigned CheckedOpcode =
I->getOpcode();
634 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
637 InstructionsState() =
delete;
639 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
648 auto *
I = dyn_cast<Instruction>(
Op);
649 if (
I && S.isOpcodeOrAlt(
I))
668 unsigned BaseIndex = 0);
676 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
677 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
678 BaseOp0 == Op0 || BaseOp1 == Op1 ||
689 "Assessing comparisons of different types?");
699 return (BasePred == Pred &&
701 (BasePred == SwappedPred &&
710 unsigned BaseIndex) {
713 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
715 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
716 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
717 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
719 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
721 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
722 unsigned AltOpcode = Opcode;
723 unsigned AltIndex = BaseIndex;
725 bool SwappedPredsCompatible = [&]() {
729 UniquePreds.
insert(BasePred);
730 UniqueNonSwappedPreds.
insert(BasePred);
731 for (
Value *V : VL) {
732 auto *
I = dyn_cast<CmpInst>(V);
738 UniqueNonSwappedPreds.
insert(CurrentPred);
739 if (!UniquePreds.
contains(CurrentPred) &&
740 !UniquePreds.
contains(SwappedCurrentPred))
741 UniquePreds.
insert(CurrentPred);
746 return UniqueNonSwappedPreds.
size() > 2 && UniquePreds.
size() == 2;
750 auto *IBase = cast<Instruction>(VL[BaseIndex]);
753 if (
auto *
CallBase = dyn_cast<CallInst>(IBase)) {
757 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
759 for (
int Cnt = 0, E = VL.
size(); Cnt < E; Cnt++) {
760 auto *
I = cast<Instruction>(VL[Cnt]);
761 unsigned InstOpcode =
I->getOpcode();
762 if (IsBinOp && isa<BinaryOperator>(
I)) {
763 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
767 AltOpcode = InstOpcode;
771 }
else if (IsCastOp && isa<CastInst>(
I)) {
772 Value *Op0 = IBase->getOperand(0);
774 Value *Op1 =
I->getOperand(0);
777 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
779 if (Opcode == AltOpcode) {
782 "Cast isn't safe for alternation, logic needs to be updated!");
783 AltOpcode = InstOpcode;
788 }
else if (
auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
789 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
790 Type *Ty0 = BaseInst->getOperand(0)->getType();
791 Type *Ty1 = Inst->getOperand(0)->getType();
793 assert(InstOpcode == Opcode &&
"Expected same CmpInst opcode.");
800 if ((E == 2 || SwappedPredsCompatible) &&
801 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
806 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
807 if (AltIndex != BaseIndex) {
810 }
else if (BasePred != CurrentPred) {
813 "CmpInst isn't safe for alternation, logic needs to be updated!");
818 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
819 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
822 }
else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
823 if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
824 if (Gep->getNumOperands() != 2 ||
825 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
826 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
827 }
else if (
auto *EI = dyn_cast<ExtractElementInst>(
I)) {
829 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
830 }
else if (
auto *LI = dyn_cast<LoadInst>(
I)) {
831 auto *BaseLI = cast<LoadInst>(IBase);
832 if (!LI->isSimple() || !BaseLI->isSimple())
833 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
834 }
else if (
auto *Call = dyn_cast<CallInst>(
I)) {
835 auto *
CallBase = cast<CallInst>(IBase);
837 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
838 if (Call->hasOperandBundles() &&
839 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
840 Call->op_begin() + Call->getBundleOperandsEndIndex(),
843 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
846 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
849 if (Mappings.
size() != BaseMappings.
size() ||
850 Mappings.
front().ISA != BaseMappings.
front().ISA ||
851 Mappings.
front().ScalarName != BaseMappings.
front().ScalarName ||
852 Mappings.
front().VectorName != BaseMappings.
front().VectorName ||
853 Mappings.
front().Shape.VF != BaseMappings.
front().Shape.VF ||
854 Mappings.
front().Shape.Parameters !=
855 BaseMappings.
front().Shape.Parameters)
856 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
861 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
864 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
865 cast<Instruction>(VL[AltIndex]));
881 case Instruction::Load: {
882 LoadInst *LI = cast<LoadInst>(UserInst);
885 case Instruction::Store: {
886 StoreInst *SI = cast<StoreInst>(UserInst);
887 return (SI->getPointerOperand() == Scalar);
889 case Instruction::Call: {
890 CallInst *CI = cast<CallInst>(UserInst);
893 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
894 Arg.value().get() == Scalar;
906 if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
913 if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
914 return LI->isSimple();
916 return SI->isSimple();
918 return !
MI->isVolatile();
926 bool ExtendingManyInputs =
false) {
930 (!ExtendingManyInputs || SubMask.
size() > Mask.size() ||
932 (SubMask.
size() == Mask.size() &&
933 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
934 [](
int Idx) { return Idx == PoisonMaskElem; }))) &&
935 "SubMask with many inputs support must be larger than the mask.");
937 Mask.append(SubMask.
begin(), SubMask.
end());
941 int TermValue = std::min(Mask.size(), SubMask.
size());
942 for (
int I = 0, E = SubMask.
size();
I < E; ++
I) {
944 (!ExtendingManyInputs &&
945 (SubMask[
I] >= TermValue || Mask[SubMask[
I]] >= TermValue)))
947 NewMask[
I] = Mask[SubMask[
I]];
963 const unsigned Sz = Order.
size();
966 for (
unsigned I = 0;
I < Sz; ++
I) {
968 UnusedIndices.
reset(Order[
I]);
970 MaskedIndices.
set(
I);
972 if (MaskedIndices.
none())
975 "Non-synced masked/available indices.");
979 assert(
Idx >= 0 &&
"Indices must be synced.");
991 for (
unsigned Lane : seq<unsigned>(VL.
size()))
992 if (cast<Instruction>(VL[Lane])->
getOpcode() == Opcode1)
993 OpcodeMask.
set(Lane);
1002 const unsigned E = Indices.
size();
1004 for (
unsigned I = 0;
I < E; ++
I)
1005 Mask[Indices[
I]] =
I;
1011 assert(!Mask.empty() &&
"Expected non-empty mask.");
1015 for (
unsigned I = 0, E = Prev.
size();
I < E; ++
I)
1017 Scalars[Mask[
I]] = Prev[
I];
1025 auto *
I = dyn_cast<Instruction>(V);
1030 auto *IO = dyn_cast<Instruction>(V);
1033 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1042 auto *
I = dyn_cast<Instruction>(V);
1046 return !
I->mayReadOrWriteMemory() && !
I->hasNUsesOrMore(
UsesLimit) &&
1048 auto *IU = dyn_cast<Instruction>(U);
1051 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1067 return !VL.
empty() &&
1071namespace slpvectorizer {
1076 struct ScheduleData;
1101 : BatchAA(*Aa),
F(Func), SE(Se),
TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1102 AC(AC), DB(DB),
DL(
DL), ORE(ORE),
1156 return !VectorizableTree.
empty() &&
1157 !VectorizableTree.
front()->UserTreeIndices.empty();
1162 assert(!VectorizableTree.
empty() &&
"No graph to get the first node from");
1163 return VectorizableTree.
front()->Scalars;
1178 VectorizableTree.
clear();
1179 ScalarToTreeEntry.clear();
1180 MultiNodeScalars.clear();
1182 NonScheduledFirst.
clear();
1183 EntryToLastInstruction.clear();
1184 ExternalUses.
clear();
1185 ExternalUsesAsGEPs.clear();
1186 for (
auto &Iter : BlocksSchedules) {
1187 BlockScheduling *BS = Iter.second.get();
1191 ReductionBitWidth = 0;
1192 CastMaxMinBWSizes.reset();
1193 ExtraBitWidthNodes.
clear();
1194 InstrElementSize.clear();
1195 UserIgnoreList =
nullptr;
1196 PostponedGathers.
clear();
1197 ValueToGatherNodes.
clear();
1254 return MaxVecRegSize;
1259 return MinVecRegSize;
1267 unsigned MaxVF =
MaxVFOption.getNumOccurrences() ?
1269 return MaxVF ? MaxVF : UINT_MAX;
1313 bool TryRecursiveCheck =
true)
const;
1337 OS <<
"{User:" << (
UserTE ? std::to_string(
UserTE->Idx) :
"null")
1338 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
1360 : TLI(TLI),
DL(
DL), SE(SE), R(R), NumLanes(NumLanes),
1361 MaxLevel(MaxLevel) {}
1415 if (isa<LoadInst>(V1)) {
1417 auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
1422 auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
1424 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1427 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1430 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1432 ((
int)V1->getNumUses() == NumLanes ||
1433 AllUsersAreInternal(V1, V2)))
1439 auto CheckSameEntryOrFail = [&]() {
1440 if (
const TreeEntry *TE1 = R.getTreeEntry(V1);
1441 TE1 && TE1 == R.getTreeEntry(V2))
1446 auto *LI1 = dyn_cast<LoadInst>(V1);
1447 auto *LI2 = dyn_cast<LoadInst>(V2);
1449 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1451 return CheckSameEntryOrFail();
1454 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1455 LI2->getPointerOperand(),
DL, SE,
true);
1456 if (!Dist || *Dist == 0) {
1459 R.TTI->isLegalMaskedGather(
1463 return CheckSameEntryOrFail();
1467 if (std::abs(*Dist) > NumLanes / 2)
1476 auto *C1 = dyn_cast<Constant>(V1);
1477 auto *C2 = dyn_cast<Constant>(V2);
1491 if (isa<UndefValue>(V2))
1495 Value *EV2 =
nullptr;
1508 int Dist = Idx2 - Idx1;
1511 if (std::abs(Dist) == 0)
1513 if (std::abs(Dist) > NumLanes / 2)
1520 return CheckSameEntryOrFail();
1523 auto *I1 = dyn_cast<Instruction>(V1);
1524 auto *I2 = dyn_cast<Instruction>(V2);
1526 if (I1->getParent() != I2->getParent())
1527 return CheckSameEntryOrFail();
1534 if (S.getOpcode() &&
1535 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.
empty() ||
1536 !S.isAltShuffle()) &&
1538 return cast<Instruction>(V)->getNumOperands() ==
1539 S.MainOp->getNumOperands();
1545 if (isa<UndefValue>(V2))
1548 return CheckSameEntryOrFail();
1582 int ShallowScoreAtThisLevel =
1591 auto *I1 = dyn_cast<Instruction>(
LHS);
1592 auto *I2 = dyn_cast<Instruction>(
RHS);
1593 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1595 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1596 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1597 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1598 ShallowScoreAtThisLevel))
1599 return ShallowScoreAtThisLevel;
1600 assert(I1 && I2 &&
"Should have early exited.");
1607 for (
unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1608 OpIdx1 != NumOperands1; ++OpIdx1) {
1610 int MaxTmpScore = 0;
1611 unsigned MaxOpIdx2 = 0;
1612 bool FoundBest =
false;
1616 ? I2->getNumOperands()
1617 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1618 assert(FromIdx <= ToIdx &&
"Bad index");
1619 for (
unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1621 if (Op2Used.
count(OpIdx2))
1626 I1, I2, CurrLevel + 1, std::nullopt);
1629 TmpScore > MaxTmpScore) {
1630 MaxTmpScore = TmpScore;
1637 Op2Used.
insert(MaxOpIdx2);
1638 ShallowScoreAtThisLevel += MaxTmpScore;
1641 return ShallowScoreAtThisLevel;
1672 struct OperandData {
1673 OperandData() =
default;
1674 OperandData(
Value *V,
bool APO,
bool IsUsed)
1675 : V(V), APO(APO), IsUsed(IsUsed) {}
1685 bool IsUsed =
false;
1694 enum class ReorderingMode {
1711 const Loop *L =
nullptr;
1714 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
1715 return OpsVec[OpIdx][Lane];
1719 const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const {
1720 return OpsVec[OpIdx][Lane];
1725 for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
1726 OpIdx != NumOperands; ++OpIdx)
1727 for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1729 OpsVec[OpIdx][Lane].IsUsed =
false;
1733 void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
1734 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1746 int getSplatScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
1747 Value *IdxLaneV = getData(
Idx, Lane).V;
1748 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1751 for (
unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1754 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1755 if (!isa<Instruction>(OpIdxLnV))
1757 Uniques.
insert(OpIdxLnV);
1759 int UniquesCount = Uniques.
size();
1760 int UniquesCntWithIdxLaneV =
1761 Uniques.
contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1762 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1763 int UniquesCntWithOpIdxLaneV =
1764 Uniques.
contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1765 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1768 UniquesCntWithOpIdxLaneV) -
1769 (
PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1778 int getExternalUseScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
1779 Value *IdxLaneV = getData(
Idx, Lane).V;
1780 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1789 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1790 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1792 return R.areAllUsersVectorized(IdxLaneI)
1800 static const int ScoreScaleFactor = 10;
1808 int Lane,
unsigned OpIdx,
unsigned Idx,
1818 int SplatScore = getSplatScore(Lane, OpIdx,
Idx);
1819 if (Score <= -SplatScore) {
1824 Score += SplatScore;
1830 Score *= ScoreScaleFactor;
1831 Score += getExternalUseScore(Lane, OpIdx,
Idx);
1849 std::optional<unsigned>
1850 getBestOperand(
unsigned OpIdx,
int Lane,
int LastLane,
1853 unsigned NumOperands = getNumOperands();
1856 Value *OpLastLane = getData(OpIdx, LastLane).V;
1859 ReorderingMode RMode = ReorderingModes[OpIdx];
1860 if (RMode == ReorderingMode::Failed)
1861 return std::nullopt;
1864 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1870 std::optional<unsigned>
Idx;
1874 BestScoresPerLanes.
try_emplace(std::make_pair(OpIdx, Lane), 0)
1880 bool IsUsed = RMode == ReorderingMode::Splat ||
1881 RMode == ReorderingMode::Constant ||
1882 RMode == ReorderingMode::Load;
1884 for (
unsigned Idx = 0;
Idx != NumOperands; ++
Idx) {
1886 OperandData &OpData = getData(
Idx, Lane);
1888 bool OpAPO = OpData.APO;
1897 if (OpAPO != OpIdxAPO)
1902 case ReorderingMode::Load:
1903 case ReorderingMode::Opcode: {
1904 bool LeftToRight = Lane > LastLane;
1905 Value *OpLeft = (LeftToRight) ? OpLastLane :
Op;
1906 Value *OpRight = (LeftToRight) ?
Op : OpLastLane;
1907 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1908 OpIdx,
Idx, IsUsed);
1909 if (Score >
static_cast<int>(BestOp.Score) ||
1910 (Score > 0 && Score ==
static_cast<int>(BestOp.Score) &&
1913 BestOp.Score = Score;
1914 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1918 case ReorderingMode::Constant:
1919 if (isa<Constant>(
Op) ||
1920 (!BestOp.Score && L && L->isLoopInvariant(
Op))) {
1922 if (isa<Constant>(
Op)) {
1924 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1927 if (isa<UndefValue>(
Op) || !isa<Constant>(
Op))
1931 case ReorderingMode::Splat:
1932 if (
Op == OpLastLane || (!BestOp.Score && isa<Constant>(
Op))) {
1933 IsUsed =
Op == OpLastLane;
1934 if (
Op == OpLastLane) {
1936 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1942 case ReorderingMode::Failed:
1948 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1952 return std::nullopt;
1959 unsigned getBestLaneToStartReordering()
const {
1960 unsigned Min = UINT_MAX;
1961 unsigned SameOpNumber = 0;
1972 for (
int I = getNumLanes();
I > 0; --
I) {
1973 unsigned Lane =
I - 1;
1974 OperandsOrderData NumFreeOpsHash =
1975 getMaxNumOperandsThatCanBeReordered(Lane);
1978 if (NumFreeOpsHash.NumOfAPOs < Min) {
1979 Min = NumFreeOpsHash.NumOfAPOs;
1980 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1982 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1983 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
1984 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1987 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1988 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1989 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
1990 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1991 auto *It = HashMap.
find(NumFreeOpsHash.Hash);
1992 if (It == HashMap.
end())
1993 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1999 unsigned BestLane = 0;
2000 unsigned CntMin = UINT_MAX;
2002 if (
Data.second.first < CntMin) {
2003 CntMin =
Data.second.first;
2004 BestLane =
Data.second.second;
2011 struct OperandsOrderData {
2014 unsigned NumOfAPOs = UINT_MAX;
2017 unsigned NumOpsWithSameOpcodeParent = 0;
2031 OperandsOrderData getMaxNumOperandsThatCanBeReordered(
unsigned Lane)
const {
2032 unsigned CntTrue = 0;
2033 unsigned NumOperands = getNumOperands();
2043 bool AllUndefs =
true;
2044 unsigned NumOpsWithSameOpcodeParent = 0;
2048 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2049 const OperandData &OpData = getData(OpIdx, Lane);
2054 if (
auto *
I = dyn_cast<Instruction>(OpData.V)) {
2056 I->getParent() != Parent) {
2057 if (NumOpsWithSameOpcodeParent == 0) {
2058 NumOpsWithSameOpcodeParent = 1;
2060 Parent =
I->getParent();
2062 --NumOpsWithSameOpcodeParent;
2065 ++NumOpsWithSameOpcodeParent;
2069 Hash,
hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2070 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2074 OperandsOrderData
Data;
2075 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2076 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2084 assert((empty() || VL.
size() == getNumLanes()) &&
2085 "Expected same number of lanes");
2086 assert(isa<Instruction>(VL[0]) &&
"Expected instruction");
2087 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2088 constexpr unsigned IntrinsicNumOperands = 2;
2089 if (isa<IntrinsicInst>(VL[0]))
2090 NumOperands = IntrinsicNumOperands;
2091 OpsVec.
resize(NumOperands);
2092 unsigned NumLanes = VL.
size();
2093 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2094 OpsVec[OpIdx].
resize(NumLanes);
2095 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2096 assert(isa<Instruction>(VL[Lane]) &&
"Expected instruction");
2107 bool IsInverseOperation = !
isCommutative(cast<Instruction>(VL[Lane]));
2108 bool APO = (OpIdx == 0) ?
false : IsInverseOperation;
2109 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2116 unsigned getNumOperands()
const {
return OpsVec.
size(); }
2119 unsigned getNumLanes()
const {
return OpsVec[0].
size(); }
2122 Value *getValue(
unsigned OpIdx,
unsigned Lane)
const {
2123 return getData(OpIdx, Lane).V;
2127 bool empty()
const {
return OpsVec.
empty(); }
2130 void clear() { OpsVec.
clear(); }
2135 bool shouldBroadcast(
Value *
Op,
unsigned OpIdx,
unsigned Lane) {
2136 bool OpAPO = getData(OpIdx, Lane).APO;
2137 bool IsInvariant = L && L->isLoopInvariant(
Op);
2139 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2143 bool FoundCandidate =
false;
2144 for (
unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2145 OperandData &
Data = getData(OpI, Ln);
2146 if (
Data.APO != OpAPO ||
Data.IsUsed)
2148 Value *OpILane = getValue(OpI, Lane);
2149 bool IsConstantOp = isa<Constant>(OpILane);
2158 ((Lns > 2 && isa<Constant>(
Data.V)) ||
2164 isa<Constant>(
Data.V)))) ||
2171 (IsInvariant && !isa<Constant>(
Data.V) &&
2173 L->isLoopInvariant(
Data.V))) {
2174 FoundCandidate =
true;
2181 if (!FoundCandidate)
2184 return getNumLanes() == 2 || Cnt > 1;
2190 : TLI(*R.TLI),
DL(*R.
DL), SE(*R.SE), R(R),
2194 appendOperandsOfVL(RootVL);
2201 assert(OpsVec[OpIdx].
size() == getNumLanes() &&
2202 "Expected same num of lanes across all operands");
2203 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2204 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2212 unsigned NumOperands = getNumOperands();
2213 unsigned NumLanes = getNumLanes();
2233 unsigned FirstLane = getBestLaneToStartReordering();
2236 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2237 Value *OpLane0 = getValue(OpIdx, FirstLane);
2240 if (isa<LoadInst>(OpLane0))
2241 ReorderingModes[OpIdx] = ReorderingMode::Load;
2242 else if (isa<Instruction>(OpLane0)) {
2244 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2245 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2247 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2249 else if (isa<Constant>(OpLane0))
2250 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2251 else if (isa<Argument>(OpLane0))
2253 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2256 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2263 auto &&SkipReordering = [
this]() {
2266 for (
const OperandData &
Data : Op0)
2269 if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
2288 if (SkipReordering())
2291 bool StrategyFailed =
false;
2299 for (
unsigned I = 0;
I < NumOperands; ++
I)
2300 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
2302 for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2305 int Lane = FirstLane +
Direction * Distance;
2306 if (Lane < 0 || Lane >= (
int)NumLanes)
2309 assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
2312 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2314 std::optional<unsigned> BestIdx = getBestOperand(
2315 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2322 swap(OpIdx, *BestIdx, Lane);
2325 StrategyFailed =
true;
2328 if (MainAltOps[OpIdx].
size() != 2) {
2329 OperandData &AltOp = getData(OpIdx, Lane);
2330 InstructionsState OpS =
2332 if (OpS.getOpcode() && OpS.isAltShuffle())
2339 if (!StrategyFailed)
2344#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2347 case ReorderingMode::Load:
2349 case ReorderingMode::Opcode:
2351 case ReorderingMode::Constant:
2353 case ReorderingMode::Splat:
2355 case ReorderingMode::Failed:
2376 const unsigned Indent = 2;
2379 OS <<
"Operand " << Cnt++ <<
"\n";
2380 for (
const OperandData &OpData : OpDataVec) {
2382 if (
Value *V = OpData.V)
2386 OS <<
", APO:" << OpData.APO <<
"}\n";
2408 int BestScore = Limit;
2409 std::optional<int>
Index;
2410 for (
int I : seq<int>(0, Candidates.size())) {
2412 Candidates[
I].second,
2415 if (Score > BestScore) {
2430 DeletedInstructions.insert(
I);
2436 return AnalyzedReductionsRoots.count(
I);
2441 AnalyzedReductionsRoots.insert(
I);
2455 AnalyzedReductionsRoots.clear();
2456 AnalyzedReductionVals.
clear();
2457 AnalyzedMinBWVals.
clear();
2469 return NonScheduledFirst.
contains(V);
2482 bool collectValuesToDemote(
const TreeEntry &E,
bool IsProfitableToDemoteRoot,
2486 unsigned &MaxDepthLevel,
2487 bool &IsProfitableToDemote,
2488 bool IsTruncRoot)
const;
2498 canReorderOperands(TreeEntry *UserTE,
2505 void reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const;
2509 TreeEntry *getVectorizedOperand(TreeEntry *UserTE,
unsigned OpIdx) {
2511 TreeEntry *TE =
nullptr;
2513 TE = getTreeEntry(V);
2514 if (TE &&
is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2516 auto It = MultiNodeScalars.find(V);
2517 if (It != MultiNodeScalars.end()) {
2518 for (TreeEntry *E : It->second) {
2519 if (
is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2527 if (It != VL.
end()) {
2528 assert(
TE->isSame(VL) &&
"Expected same scalars.");
2536 const TreeEntry *getVectorizedOperand(
const TreeEntry *UserTE,
2537 unsigned OpIdx)
const {
2538 return const_cast<BoUpSLP *
>(
this)->getVectorizedOperand(
2539 const_cast<TreeEntry *
>(UserTE), OpIdx);
2543 bool areAllUsersVectorized(
2552 const TreeEntry *getOperandEntry(
const TreeEntry *E,
unsigned Idx)
const;
2556 getCastContextHint(
const TreeEntry &TE)
const;
2565 const EdgeInfo &EI);
2576 bool ResizeAllowed =
false)
const;
2587 Value *vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
bool PostponedPHIs);
2592 template <
typename BVTy,
typename ResTy,
typename...
Args>
2593 ResTy processBuildVector(
const TreeEntry *E,
Type *ScalarTy, Args &...Params);
2598 Value *createBuildVector(
const TreeEntry *E,
Type *ScalarTy);
2604 Instruction &getLastInstructionInBundle(
const TreeEntry *E);
2611 std::optional<TargetTransformInfo::ShuffleKind>
2623 unsigned NumParts)
const;
2635 std::optional<TargetTransformInfo::ShuffleKind>
2636 isGatherShuffledSingleRegisterEntry(
2653 isGatherShuffledEntry(
2656 unsigned NumParts,
bool ForOrder =
false);
2663 Type *ScalarTy)
const;
2667 void setInsertPointAfterBundle(
const TreeEntry *E);
2675 bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
2688 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
2704 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
2708 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2725 [Scalars](
Value *V,
int Idx) {
2726 return (isa<UndefValue>(V) &&
2727 Idx == PoisonMaskElem) ||
2728 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2731 if (!ReorderIndices.empty()) {
2738 return IsSame(Scalars, Mask);
2739 if (VL.
size() == ReuseShuffleIndices.size()) {
2741 return IsSame(Scalars, Mask);
2745 return IsSame(Scalars, ReuseShuffleIndices);
2748 bool isOperandGatherNode(
const EdgeInfo &UserEI)
const {
2749 return State == TreeEntry::NeedToGather &&
2750 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2751 UserTreeIndices.front().UserTE == UserEI.UserTE;
2755 bool hasEqualOperands(
const TreeEntry &TE)
const {
2756 if (
TE.getNumOperands() != getNumOperands())
2759 for (
unsigned I = 0, E = getNumOperands();
I < E; ++
I) {
2760 unsigned PrevCount =
Used.count();
2761 for (
unsigned K = 0;
K < E; ++
K) {
2764 if (getOperand(K) ==
TE.getOperand(
I)) {
2770 if (PrevCount ==
Used.count())
2779 unsigned getVectorFactor()
const {
2780 if (!ReuseShuffleIndices.empty())
2781 return ReuseShuffleIndices.size();
2782 return Scalars.
size();
2817 VecTreeTy &Container;
2841 assert(Operands[OpIdx].empty() &&
"Already resized?");
2843 "Number of operands is greater than the number of scalars.");
2849 void setOperandsInOrder() {
2851 auto *I0 = cast<Instruction>(Scalars[0]);
2852 Operands.resize(I0->getNumOperands());
2853 unsigned NumLanes = Scalars.size();
2854 for (
unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2855 OpIdx != NumOperands; ++OpIdx) {
2857 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2858 auto *
I = cast<Instruction>(Scalars[Lane]);
2859 assert(
I->getNumOperands() == NumOperands &&
2860 "Expected same number of operands");
2861 Operands[OpIdx][Lane] =
I->getOperand(OpIdx);
2885 unsigned getNumOperands()
const {
return Operands.size(); }
2888 Value *getSingleOperand(
unsigned OpIdx)
const {
2890 assert(!Operands[OpIdx].empty() &&
"No operand available");
2895 bool isAltShuffle()
const {
return MainOp != AltOp; }
2898 unsigned CheckedOpcode =
I->getOpcode();
2899 return (getOpcode() == CheckedOpcode ||
2900 getAltOpcode() == CheckedOpcode);
2907 auto *
I = dyn_cast<Instruction>(
Op);
2908 if (
I && isOpcodeOrAlt(
I))
2913 void setOperations(
const InstructionsState &S) {
2927 unsigned getOpcode()
const {
2928 return MainOp ? MainOp->
getOpcode() : 0;
2931 unsigned getAltOpcode()
const {
2937 int findLaneForValue(
Value *V)
const {
2938 unsigned FoundLane = std::distance(Scalars.begin(),
find(Scalars, V));
2939 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
2940 if (!ReorderIndices.
empty())
2941 FoundLane = ReorderIndices[FoundLane];
2942 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
2943 if (!ReuseShuffleIndices.
empty()) {
2944 FoundLane = std::distance(ReuseShuffleIndices.
begin(),
2945 find(ReuseShuffleIndices, FoundLane));
2959 bool isNonPowOf2Vec()
const {
2961 assert((!IsNonPowerOf2 || ReuseShuffleIndices.
empty()) &&
2962 "Reshuffling not supported with non-power-of-2 vectors yet.");
2963 return IsNonPowerOf2;
2970 for (
unsigned OpI = 0, OpE =
Operands.size(); OpI != OpE; ++OpI) {
2971 dbgs() <<
"Operand " << OpI <<
":\n";
2972 for (
const Value *V : Operands[OpI])
2975 dbgs() <<
"Scalars: \n";
2976 for (
Value *V : Scalars)
2978 dbgs() <<
"State: ";
2981 dbgs() <<
"Vectorize\n";
2983 case ScatterVectorize:
2984 dbgs() <<
"ScatterVectorize\n";
2986 case StridedVectorize:
2987 dbgs() <<
"StridedVectorize\n";
2990 dbgs() <<
"NeedToGather\n";
2993 dbgs() <<
"MainOp: ";
2995 dbgs() << *MainOp <<
"\n";
2998 dbgs() <<
"AltOp: ";
3000 dbgs() << *AltOp <<
"\n";
3003 dbgs() <<
"VectorizedValue: ";
3004 if (VectorizedValue)
3005 dbgs() << *VectorizedValue <<
"\n";
3008 dbgs() <<
"ReuseShuffleIndices: ";
3009 if (ReuseShuffleIndices.
empty())
3012 for (
int ReuseIdx : ReuseShuffleIndices)
3013 dbgs() << ReuseIdx <<
", ";
3015 dbgs() <<
"ReorderIndices: ";
3016 for (
unsigned ReorderIdx : ReorderIndices)
3017 dbgs() << ReorderIdx <<
", ";
3019 dbgs() <<
"UserTreeIndices: ";
3020 for (
const auto &EInfo : UserTreeIndices)
3021 dbgs() << EInfo <<
", ";
3028 void dumpTreeCosts(
const TreeEntry *E,
InstructionCost ReuseShuffleCost,
3031 dbgs() <<
"SLP: " << Banner <<
":\n";
3033 dbgs() <<
"SLP: Costs:\n";
3034 dbgs() <<
"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<
"\n";
3035 dbgs() <<
"SLP: VectorCost = " << VecCost <<
"\n";
3036 dbgs() <<
"SLP: ScalarCost = " << ScalarCost <<
"\n";
3037 dbgs() <<
"SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3038 << ReuseShuffleCost + VecCost - ScalarCost <<
"\n";
3044 std::optional<ScheduleData *> Bundle,
3045 const InstructionsState &S,
3046 const EdgeInfo &UserTreeIdx,
3049 TreeEntry::EntryState EntryState =
3050 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3051 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3052 ReuseShuffleIndices, ReorderIndices);
3056 TreeEntry::EntryState EntryState,
3057 std::optional<ScheduleData *> Bundle,
3058 const InstructionsState &S,
3059 const EdgeInfo &UserTreeIdx,
3062 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3063 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3064 "Need to vectorize gather entry?");
3065 VectorizableTree.
push_back(std::make_unique<TreeEntry>(VectorizableTree));
3066 TreeEntry *
Last = VectorizableTree.
back().get();
3067 Last->Idx = VectorizableTree.
size() - 1;
3068 Last->State = EntryState;
3069 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3070 ReuseShuffleIndices.end());
3071 if (ReorderIndices.
empty()) {
3073 Last->setOperations(S);
3076 Last->Scalars.assign(VL.
size(),
nullptr);
3079 if (Idx >= VL.size())
3080 return UndefValue::get(VL.front()->getType());
3084 Last->setOperations(S);
3085 Last->ReorderIndices.append(ReorderIndices.
begin(), ReorderIndices.
end());
3087 if (
Last->State != TreeEntry::NeedToGather) {
3088 for (
Value *V : VL) {
3089 const TreeEntry *
TE = getTreeEntry(V);
3091 "Scalar already in tree!");
3094 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(
Last);
3097 ScalarToTreeEntry[
V] =
Last;
3100 ScheduleData *BundleMember = *Bundle;
3101 assert((BundleMember || isa<PHINode>(S.MainOp) ||
3104 "Bundle and VL out of sync");
3106 for (
Value *V : VL) {
3111 BundleMember->TE =
Last;
3112 BundleMember = BundleMember->NextInBundle;
3115 assert(!BundleMember &&
"Bundle and VL out of sync");
3118 bool AllConstsOrCasts =
true;
3121 auto *
I = dyn_cast<CastInst>(V);
3122 AllConstsOrCasts &=
I &&
I->getType()->isIntegerTy();
3125 if (AllConstsOrCasts)
3127 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3128 MustGather.
insert(VL.begin(), VL.end());
3131 if (UserTreeIdx.UserTE) {
3132 Last->UserTreeIndices.push_back(UserTreeIdx);
3133 assert((!
Last->isNonPowOf2Vec() ||
Last->ReorderIndices.empty()) &&
3134 "Reordering isn't implemented for non-power-of-2 nodes yet");
3141 TreeEntry::VecTreeTy VectorizableTree;
3146 for (
unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3147 VectorizableTree[
Id]->dump();
3153 TreeEntry *getTreeEntry(
Value *V) {
return ScalarToTreeEntry.lookup(V); }
3155 const TreeEntry *getTreeEntry(
Value *V)
const {
3156 return ScalarToTreeEntry.lookup(V);
3165 bool areAltOperandsProfitable(
const InstructionsState &S,
3170 TreeEntry::EntryState getScalarsVectorizationState(
3203 using ValueToGatherNodesMap =
3205 ValueToGatherNodesMap ValueToGatherNodes;
3208 struct ExternalUser {
3232 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
3233 auto It = AliasCache.
find(Key);
3234 if (It != AliasCache.
end())
3239 AliasCache.
try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3243 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3275 UserList ExternalUses;
3295 struct ScheduleData {
3298 enum { InvalidDeps = -1 };
3300 ScheduleData() =
default;
3302 void init(
int BlockSchedulingRegionID,
Value *OpVal) {
3303 FirstInBundle =
this;
3304 NextInBundle =
nullptr;
3305 NextLoadStore =
nullptr;
3306 IsScheduled =
false;
3307 SchedulingRegionID = BlockSchedulingRegionID;
3308 clearDependencies();
3315 if (hasValidDependencies()) {
3316 assert(UnscheduledDeps <= Dependencies &&
"invariant");
3318 assert(UnscheduledDeps == Dependencies &&
"invariant");
3322 assert(isSchedulingEntity() &&
3323 "unexpected scheduled state");
3324 for (
const ScheduleData *BundleMember =
this; BundleMember;
3325 BundleMember = BundleMember->NextInBundle) {
3326 assert(BundleMember->hasValidDependencies() &&
3327 BundleMember->UnscheduledDeps == 0 &&
3328 "unexpected scheduled state");
3329 assert((BundleMember ==
this || !BundleMember->IsScheduled) &&
3330 "only bundle is marked scheduled");
3334 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3335 "all bundle members must be in same basic block");
3341 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
3345 bool isSchedulingEntity()
const {
return FirstInBundle ==
this; }
3349 bool isPartOfBundle()
const {
3350 return NextInBundle !=
nullptr || FirstInBundle !=
this ||
TE;
3355 bool isReady()
const {
3356 assert(isSchedulingEntity() &&
3357 "can't consider non-scheduling entity for ready list");
3358 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3364 int incrementUnscheduledDeps(
int Incr) {
3365 assert(hasValidDependencies() &&
3366 "increment of unscheduled deps would be meaningless");
3367 UnscheduledDeps += Incr;
3368 return FirstInBundle->unscheduledDepsInBundle();
// Re-arm the unscheduled-dependency counter back to the full dependency
// count, so the node can be scheduled again from scratch.
3373 void resetUnscheduledDeps() {
3374 UnscheduledDeps = Dependencies;
3378 void clearDependencies() {
3379 Dependencies = InvalidDeps;
3380 resetUnscheduledDeps();
3381 MemoryDependencies.clear();
3382 ControlDependencies.clear();
3385 int unscheduledDepsInBundle()
const {
3386 assert(isSchedulingEntity() &&
"only meaningful on the bundle");
3388 for (
const ScheduleData *BundleMember =
this; BundleMember;
3389 BundleMember = BundleMember->NextInBundle) {
3390 if (BundleMember->UnscheduledDeps == InvalidDeps)
3392 Sum += BundleMember->UnscheduledDeps;
3398 if (!isSchedulingEntity()) {
3399 os <<
"/ " << *Inst;
3400 }
else if (NextInBundle) {
3402 ScheduleData *SD = NextInBundle;
3404 os <<
';' << *SD->Inst;
3405 SD = SD->NextInBundle;
3416 Value *OpValue =
nullptr;
3419 TreeEntry *
TE =
nullptr;
3423 ScheduleData *FirstInBundle =
nullptr;
3427 ScheduleData *NextInBundle =
nullptr;
3431 ScheduleData *NextLoadStore =
nullptr;
3445 int SchedulingRegionID = 0;
3448 int SchedulingPriority = 0;
3454 int Dependencies = InvalidDeps;
3460 int UnscheduledDeps = InvalidDeps;
3464 bool IsScheduled =
false;
3469 const BoUpSLP::ScheduleData &SD) {
3494 struct BlockScheduling {
3496 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
3500 ScheduleStart =
nullptr;
3501 ScheduleEnd =
nullptr;
3502 FirstLoadStoreInRegion =
nullptr;
3503 LastLoadStoreInRegion =
nullptr;
3504 RegionHasStackSave =
false;
3508 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3511 ScheduleRegionSize = 0;
3515 ++SchedulingRegionID;
3519 if (BB !=
I->getParent())
3522 ScheduleData *SD = ScheduleDataMap.lookup(
I);
3523 if (SD && isInSchedulingRegion(SD))
3528 ScheduleData *getScheduleData(
Value *V) {
3529 if (
auto *
I = dyn_cast<Instruction>(V))
3530 return getScheduleData(
I);
3534 ScheduleData *getScheduleData(
Value *V,
Value *Key) {
3536 return getScheduleData(V);
3537 auto I = ExtraScheduleDataMap.find(V);
3538 if (
I != ExtraScheduleDataMap.end()) {
3539 ScheduleData *SD =
I->second.lookup(Key);
3540 if (SD && isInSchedulingRegion(SD))
3546 bool isInSchedulingRegion(ScheduleData *SD)
const {
3547 return SD->SchedulingRegionID == SchedulingRegionID;
3552 template <
typename ReadyListType>
3553 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3554 SD->IsScheduled =
true;
3557 for (ScheduleData *BundleMember = SD; BundleMember;
3558 BundleMember = BundleMember->NextInBundle) {
3559 if (BundleMember->Inst != BundleMember->OpValue)
3565 auto &&DecrUnsched = [
this, &ReadyList](
Instruction *
I) {
3566 doForAllOpcodes(
I, [&ReadyList](ScheduleData *OpDef) {
3567 if (OpDef && OpDef->hasValidDependencies() &&
3568 OpDef->incrementUnscheduledDeps(-1) == 0) {
3572 ScheduleData *DepBundle = OpDef->FirstInBundle;
3573 assert(!DepBundle->IsScheduled &&
3574 "already scheduled bundle gets ready");
3575 ReadyList.insert(DepBundle);
3577 <<
"SLP: gets ready (def): " << *DepBundle <<
"\n");
3585 if (TreeEntry *TE = BundleMember->TE) {
3587 int Lane = std::distance(
TE->Scalars.begin(),
3588 find(
TE->Scalars, BundleMember->Inst));
3589 assert(Lane >= 0 &&
"Lane not set");
3597 auto *
In = BundleMember->Inst;
3600 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3601 In->getNumOperands() ==
TE->getNumOperands()) &&
3602 "Missed TreeEntry operands?");
3605 for (
unsigned OpIdx = 0, NumOperands =
TE->getNumOperands();
3606 OpIdx != NumOperands; ++OpIdx)
3607 if (
auto *
I = dyn_cast<Instruction>(
TE->getOperand(OpIdx)[Lane]))
3612 for (
Use &U : BundleMember->Inst->operands())
3613 if (
auto *
I = dyn_cast<Instruction>(
U.get()))
3617 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3618 if (MemoryDepSD->hasValidDependencies() &&
3619 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3622 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3623 assert(!DepBundle->IsScheduled &&
3624 "already scheduled bundle gets ready");
3625 ReadyList.insert(DepBundle);
3627 <<
"SLP: gets ready (mem): " << *DepBundle <<
"\n");
3631 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3632 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3635 ScheduleData *DepBundle = DepSD->FirstInBundle;
3636 assert(!DepBundle->IsScheduled &&
3637 "already scheduled bundle gets ready");
3638 ReadyList.insert(DepBundle);
3640 <<
"SLP: gets ready (ctl): " << *DepBundle <<
"\n");
3651 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3652 ScheduleStart->comesBefore(ScheduleEnd) &&
3653 "Not a valid scheduling region?");
3655 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
3656 auto *SD = getScheduleData(
I);
3659 assert(isInSchedulingRegion(SD) &&
3660 "primary schedule data not in window?");
3661 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3662 "entire bundle in window!");
3664 doForAllOpcodes(
I, [](ScheduleData *SD) { SD->verify(); });
3667 for (
auto *SD : ReadyInsts) {
3668 assert(SD->isSchedulingEntity() && SD->isReady() &&
3669 "item in ready list not ready?");
3674 void doForAllOpcodes(
Value *V,
3676 if (ScheduleData *SD = getScheduleData(V))
3678 auto I = ExtraScheduleDataMap.find(V);
3679 if (
I != ExtraScheduleDataMap.end())
3680 for (
auto &
P :
I->second)
3681 if (isInSchedulingRegion(
P.second))
3686 template <
typename ReadyListType>
3687 void initialFillReadyList(ReadyListType &ReadyList) {
3688 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
3689 doForAllOpcodes(
I, [&](ScheduleData *SD) {
3690 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3692 ReadyList.insert(SD);
3694 <<
"SLP: initially in ready list: " << *SD <<
"\n");
3709 std::optional<ScheduleData *>
3711 const InstructionsState &S);
3717 ScheduleData *allocateScheduleDataChunks();
3721 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
3726 ScheduleData *PrevLoadStore,
3727 ScheduleData *NextLoadStore);
3731 void calculateDependencies(ScheduleData *SD,
bool InsertInReadyList,
3735 void resetSchedule();
3756 ExtraScheduleDataMap;
3769 ScheduleData *FirstLoadStoreInRegion =
nullptr;
3773 ScheduleData *LastLoadStoreInRegion =
nullptr;
3778 bool RegionHasStackSave =
false;
3781 int ScheduleRegionSize = 0;
3790 int SchedulingRegionID = 1;
3798 void scheduleBlock(BlockScheduling *BS);
3805 struct OrdersTypeDenseMapInfo {
3818 static unsigned getHashValue(
const OrdersType &V) {
3839 unsigned MaxVecRegSize;
3840 unsigned MinVecRegSize;
3855 unsigned ReductionBitWidth = 0;
3859 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
3878 struct ChildIteratorType
3880 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3891 return R.VectorizableTree[0].get();
3895 return {
N->UserTreeIndices.begin(),
N->Container};
3899 return {
N->UserTreeIndices.end(),
N->Container};
3904 class nodes_iterator {
3915 bool operator!=(
const nodes_iterator &N2)
const {
return N2.It != It; }
3919 return nodes_iterator(R->VectorizableTree.begin());
3923 return nodes_iterator(R->VectorizableTree.end());
3926 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
3937 OS << Entry->Idx <<
".\n";
3940 for (
auto *V : Entry->Scalars) {
3942 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
3943 return EU.Scalar == V;
3953 if (Entry->State == TreeEntry::NeedToGather)
3955 if (Entry->State == TreeEntry::ScatterVectorize ||
3956 Entry->State == TreeEntry::StridedVectorize)
3957 return "color=blue";
3966 for (
auto *
I : DeletedInstructions) {
3967 for (
Use &U :
I->operands()) {
3968 auto *
Op = dyn_cast<Instruction>(U.get());
3969 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
3973 I->dropAllReferences();
3975 for (
auto *
I : DeletedInstructions) {
3977 "trying to erase instruction with users.");
3978 I->eraseFromParent();
3984#ifdef EXPENSIVE_CHECKS
3995 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
3996 "Expected non-empty mask.");
3999 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
4001 Reuses[Mask[
I]] = Prev[
I];
4009 bool BottomOrder =
false) {
4010 assert(!Mask.empty() &&
"Expected non-empty mask.");
4011 unsigned Sz = Mask.size();
4014 if (Order.
empty()) {
4016 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
4018 PrevOrder.
swap(Order);
4021 for (
unsigned I = 0;
I < Sz; ++
I)
4023 Order[
I] = PrevOrder[Mask[
I]];
4025 return Data.value() == Sz ||
Data.index() ==
Data.value();
4034 if (Order.
empty()) {
4036 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
4046 for (
unsigned I = 0;
I < Sz; ++
I)
4048 Order[MaskOrder[
I]] =
I;
4052std::optional<BoUpSLP::OrdersType>
4054 assert(TE.State == TreeEntry::NeedToGather &&
"Expected gather node only.");
4058 Type *ScalarTy = GatheredScalars.
front()->getType();
4059 int NumScalars = GatheredScalars.
size();
4061 return std::nullopt;
4064 if (NumParts == 0 || NumParts >= NumScalars)
4070 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4072 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4075 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
4076 return std::nullopt;
4077 OrdersType CurrentOrder(NumScalars, NumScalars);
4078 if (GatherShuffles.
size() == 1 &&
4080 Entries.front().front()->isSame(TE.Scalars)) {
4083 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
4084 return CurrentOrder;
4088 return all_of(Mask, [&](
int I) {
4095 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
4096 (Entries.size() != 1 ||
4097 Entries.front().front()->ReorderIndices.empty())) ||
4098 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
4099 return std::nullopt;
4104 for (
int I : seq<int>(0, NumParts)) {
4105 if (ShuffledSubMasks.
test(
I))
4107 const int VF = GetVF(
I);
4113 if (
any_of(Slice, [&](
int I) {
return I != NumScalars; })) {
4114 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4115 ShuffledSubMasks.
set(
I);
4119 int FirstMin = INT_MAX;
4120 int SecondVecFound =
false;
4121 for (
int K : seq<int>(Limit)) {
4122 int Idx = Mask[
I * PartSz + K];
4124 Value *V = GatheredScalars[
I * PartSz + K];
4126 SecondVecFound =
true;
4135 SecondVecFound =
true;
4139 FirstMin = (FirstMin / PartSz) * PartSz;
4141 if (SecondVecFound) {
4142 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4143 ShuffledSubMasks.
set(
I);
4146 for (
int K : seq<int>(Limit)) {
4147 int Idx = Mask[
I * PartSz + K];
4151 if (
Idx >= PartSz) {
4152 SecondVecFound =
true;
4155 if (CurrentOrder[
I * PartSz +
Idx] >
4156 static_cast<unsigned>(
I * PartSz + K) &&
4157 CurrentOrder[
I * PartSz +
Idx] !=
4158 static_cast<unsigned>(
I * PartSz +
Idx))
4159 CurrentOrder[
I * PartSz +
Idx] =
I * PartSz + K;
4162 if (SecondVecFound) {
4163 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4164 ShuffledSubMasks.
set(
I);
4170 if (!ExtractShuffles.
empty())
4171 TransformMaskToOrder(
4172 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
4173 if (!ExtractShuffles[
I])
4176 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
4177 for (
unsigned Idx : seq<unsigned>(Sz)) {
4178 int K =
I * PartSz +
Idx;
4181 if (!TE.ReuseShuffleIndices.empty())
4182 K = TE.ReuseShuffleIndices[K];
4183 if (!TE.ReorderIndices.empty())
4184 K = std::distance(TE.ReorderIndices.begin(),
4185 find(TE.ReorderIndices, K));
4186 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4189 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4191 .getKnownMinValue());
4196 if (GatherShuffles.
size() == 1 && NumParts != 1) {
4197 if (ShuffledSubMasks.
any())
4198 return std::nullopt;
4199 PartSz = NumScalars;
4202 if (!Entries.empty())
4203 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
4204 if (!GatherShuffles[
I])
4206 return std::max(Entries[
I].front()->getVectorFactor(),
4207 Entries[
I].back()->getVectorFactor());
4210 count_if(CurrentOrder, [&](
int Idx) {
return Idx == NumScalars; });
4211 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4212 return std::nullopt;
4213 return std::move(CurrentOrder);
4218 bool CompareOpcodes =
true) {
4221 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4224 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4227 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4231 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4236template <
typename T>
4238 Align CommonAlignment = cast<T>(VL.
front())->getAlign();
4240 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->
getAlign());
4241 return CommonAlignment;
4246 unsigned Sz = Order.
size();
4248 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4259static std::optional<Value *>
4265 const SCEV *PtrSCEVLowest =
nullptr;
4266 const SCEV *PtrSCEVHighest =
nullptr;
4272 return std::nullopt;
4274 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4275 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4279 if (isa<SCEVCouldNotCompute>(Diff))
4280 return std::nullopt;
4282 PtrSCEVLowest = PtrSCEV;
4286 if (isa<SCEVCouldNotCompute>(Diff1))
4287 return std::nullopt;
4289 PtrSCEVHighest = PtrSCEV;
4295 if (isa<SCEVCouldNotCompute>(Dist))
4296 return std::nullopt;
4297 int Size =
DL.getTypeStoreSize(ElemTy);
4298 auto TryGetStride = [&](
const SCEV *Dist,
4299 const SCEV *Multiplier) ->
const SCEV * {
4300 if (
const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4301 if (M->getOperand(0) == Multiplier)
4302 return M->getOperand(1);
4303 if (M->getOperand(1) == Multiplier)
4304 return M->getOperand(0);
4307 if (Multiplier == Dist)
4312 const SCEV *Stride =
nullptr;
4313 if (
Size != 1 || SCEVs.
size() > 2) {
4315 Stride = TryGetStride(Dist, Sz);
4317 return std::nullopt;
4319 if (!Stride || isa<SCEVConstant>(Stride))
4320 return std::nullopt;
4323 using DistOrdPair = std::pair<int64_t, int>;
4325 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
4327 bool IsConsecutive =
true;
4328 for (
const SCEV *PtrSCEV : SCEVs) {
4330 if (PtrSCEV != PtrSCEVLowest) {
4332 const SCEV *Coeff = TryGetStride(Diff, Stride);
4334 return std::nullopt;
4335 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4336 if (!SC || isa<SCEVCouldNotCompute>(SC))
4337 return std::nullopt;
4341 return std::nullopt;
4342 Dist = SC->getAPInt().getZExtValue();
4346 return std::nullopt;
4347 auto Res = Offsets.emplace(Dist, Cnt);
4349 return std::nullopt;
4351 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4354 if (Offsets.size() != SCEVs.
size())
4355 return std::nullopt;
4356 SortedIndices.
clear();
4357 if (!IsConsecutive) {
4361 for (
const std::pair<int64_t, int> &Pair : Offsets) {
4362 SortedIndices[Cnt] = Pair.second;
4372static std::pair<InstructionCost, InstructionCost>
4388 if (
DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy))
4394 const unsigned Sz = VL.
size();
4396 auto *POIter = PointerOps.
begin();
4397 for (
Value *V : VL) {
4398 auto *L = cast<LoadInst>(V);
4401 *POIter = L->getPointerOperand();
4412 "supported with VectorizeNonPowerOf2");
4416 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4427 if (Order.
empty()) {
4428 Ptr0 = PointerOps.
front();
4429 PtrN = PointerOps.
back();
4431 Ptr0 = PointerOps[Order.
front()];
4432 PtrN = PointerOps[Order.
back()];
4434 std::optional<int> Diff =
4437 if (
static_cast<unsigned>(*Diff) == Sz - 1)
4440 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4452 (
static_cast<unsigned>(std::abs(*Diff)) <=
4455 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4456 *Diff == -(
static_cast<int>(Sz) - 1))) {
4457 int Stride = *Diff /
static_cast<int>(Sz - 1);
4458 if (*Diff == Stride *
static_cast<int>(Sz - 1)) {
4470 else if (
Ptr != Ptr0)
4475 if (((Dist / Stride) * Stride) != Dist ||
4476 !Dists.
insert(Dist).second)
4479 if (Dists.
size() == Sz)
4485 auto CheckForShuffledLoads = [&, &
TTI = *
TTI](
Align CommonAlignment) {
4486 unsigned Sz =
DL->getTypeSizeInBits(ScalarTy);
4488 unsigned MaxVF = std::max<unsigned>(
bit_floor(VL.
size() / 2), MinVF);
4489 MaxVF = std::min(
getMaximumVF(Sz, Instruction::Load), MaxVF);
4490 for (
unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4491 unsigned VectorizedCnt = 0;
4493 for (
unsigned Cnt = 0,
End = VL.
size(); Cnt + VF <=
End;
4494 Cnt += VF, ++VectorizedCnt) {
4512 if (VectorizedCnt == VL.
size() / VF) {
4515 auto [ScalarGEPCost, VectorGEPCost] =
getGEPCosts(
4516 TTI, PointerOps, PointerOps.
front(), Instruction::GetElementPtr,
4520 Instruction::Load, VecTy,
4522 false, CommonAlignment,
CostKind) +
4523 VectorGEPCost - ScalarGEPCost;
4527 auto *LI0 = cast<LoadInst>(VL[
I * VF]);
4530 auto [ScalarGEPCost, VectorGEPCost] =
4532 LI0->getPointerOperand(), Instruction::Load,
4535 Instruction::Load, SubVecTy, LI0->getAlign(),
4536 LI0->getPointerAddressSpace(),
CostKind,
4538 VectorGEPCost - ScalarGEPCost;
4542 auto [ScalarGEPCost, VectorGEPCost] =
4544 LI0->getPointerOperand(), Instruction::Load,
4548 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4549 false, CommonAlignment,
CostKind) +
4550 VectorGEPCost - ScalarGEPCost;
4554 auto [ScalarGEPCost, VectorGEPCost] =
getGEPCosts(
4556 LI0->getPointerOperand(), Instruction::GetElementPtr,
4560 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4561 false, CommonAlignment,
CostKind) +
4562 VectorGEPCost - ScalarGEPCost;
4567 "Expected only consecutive, strided or masked gather loads.");
4570 for (
int Idx : seq<int>(0, VL.
size()))
4579 if (MaskedGatherCost >= VecLdCost)
4589 bool ProfitableGatherPointers =
4592 return L->isLoopInvariant(V);
4594 if (ProfitableGatherPointers ||
all_of(PointerOps, [IsSorted](
Value *
P) {
4595 auto *
GEP = dyn_cast<GetElementPtrInst>(
P);
4597 (
GEP &&
GEP->getNumOperands() == 2 &&
4598 isa<Constant, Instruction>(
GEP->getOperand(1)));
4600 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4605 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4624 "Expected list of pointer operands.");
4629 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4634 std::optional<int> Diff =
4640 Base.second.emplace_back(
Ptr, *Diff, Cnt++);
4646 if (Bases.
size() > VL.
size() / 2 - 1)
4650 Bases[
Ptr].emplace_back(
Ptr, 0, Cnt++);
4656 bool AnyConsecutive =
false;
4657 for (
auto &
Base : Bases) {
4658 auto &Vec =
Base.second;
4659 if (Vec.size() > 1) {
4661 const std::tuple<Value *, int, unsigned> &
Y) {
4662 return std::get<1>(
X) < std::get<1>(
Y);
4664 int InitialOffset = std::get<1>(Vec[0]);
4666 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
4672 SortedIndices.
clear();
4673 if (!AnyConsecutive)
4676 for (
auto &
Base : Bases) {
4677 for (
auto &
T :
Base.second)
4682 "Expected SortedIndices to be the size of VL");
4686std::optional<BoUpSLP::OrdersType>
4688 assert(TE.State == TreeEntry::NeedToGather &&
"Expected gather node only.");
4689 Type *ScalarTy = TE.Scalars[0]->getType();
4692 Ptrs.
reserve(TE.Scalars.size());
4693 for (
Value *V : TE.Scalars) {
4694 auto *L = dyn_cast<LoadInst>(V);
4695 if (!L || !L->isSimple())
4696 return std::nullopt;
4702 return std::move(Order);
4703 return std::nullopt;
4714 if (VU->
getType() != V->getType())
4717 if (!VU->
hasOneUse() && !V->hasOneUse())
4723 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4729 cast<VectorType>(VU->
getType())->getElementCount().getKnownMinValue());
4730 bool IsReusedIdx =
false;
4732 if (IE2 == VU && !IE1)
4734 if (IE1 == V && !IE2)
4735 return V->hasOneUse();
4736 if (IE1 && IE1 != V) {
4738 IsReusedIdx |= ReusedIdx.
test(Idx1);
4739 ReusedIdx.
set(Idx1);
4740 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
4743 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4745 if (IE2 && IE2 != VU) {
4747 IsReusedIdx |= ReusedIdx.
test(Idx2);
4748 ReusedIdx.
set(Idx2);
4749 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4752 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4754 }
while (!IsReusedIdx && (IE1 || IE2));
4758std::optional<BoUpSLP::OrdersType>
4761 if (TE.isNonPowOf2Vec())
4762 return std::nullopt;
4766 if (!TE.ReuseShuffleIndices.empty()) {
4768 return std::nullopt;
4776 unsigned Sz = TE.Scalars.size();
4777 if (TE.State == TreeEntry::NeedToGather) {
4778 if (std::optional<OrdersType> CurrentOrder =
4783 ::addMask(Mask, TE.ReuseShuffleIndices);
4784 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4785 unsigned Sz = TE.Scalars.size();
4786 for (
int K = 0,
E = TE.getVectorFactor() / Sz; K <
E; ++K) {
4789 Res[
Idx + K * Sz] =
I + K * Sz;
4791 return std::move(Res);
4794 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4796 TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
4797 return std::nullopt;
4801 if (TE.ReorderIndices.empty())
4802 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
4805 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4806 unsigned VF = ReorderMask.
size();
4810 for (
unsigned I = 0;
I < VF;
I += Sz) {
4812 unsigned UndefCnt = 0;
4813 unsigned Limit = std::min(Sz, VF -
I);
4822 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
4824 return std::nullopt;
4826 for (
unsigned K = 0; K < NumParts; ++K)
4827 ResOrder[Val + Sz * K] =
I + K;
4829 return std::move(ResOrder);
4831 unsigned VF = TE.getVectorFactor();
4834 TE.ReuseShuffleIndices.end());
4835 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4837 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4838 return Idx && *Idx < Sz;
4841 if (TE.ReorderIndices.empty())
4842 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
4845 for (
unsigned I = 0;
I < VF; ++
I) {
4846 int &
Idx = ReusedMask[
I];
4849 Value *V = TE.Scalars[ReorderMask[
Idx]];
4851 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
4857 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
4858 auto *It = ResOrder.
begin();
4859 for (
unsigned K = 0; K < VF; K += Sz) {
4863 std::iota(SubMask.begin(), SubMask.end(), 0);
4865 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
4866 std::advance(It, Sz);
4868 if (TE.State == TreeEntry::NeedToGather &&
4870 [](
const auto &
Data) {
return Data.index() ==
Data.value(); }))
4871 return std::nullopt;
4872 return std::move(ResOrder);
4874 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
4875 any_of(TE.UserTreeIndices,
4877 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
4879 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
4880 return std::nullopt;
4881 if ((TE.State == TreeEntry::Vectorize ||
4882 TE.State == TreeEntry::StridedVectorize) &&
4883 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4884 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4886 return TE.ReorderIndices;
4887 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4888 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
4889 Value *V1 = TE.Scalars[I1];
4890 Value *V2 = TE.Scalars[I2];
4891 if (V1 == V2 || (V1->
getNumUses() == 0 && V2->getNumUses() == 0))
4897 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->
user_begin());
4898 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4899 if (
auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4900 if (
auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4907 if (
auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4908 if (
auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4909 if (EE1->getOperand(0) != EE2->getOperand(0))
4915 auto IsIdentityOrder = [](
const OrdersType &Order) {
4916 for (
unsigned Idx : seq<unsigned>(0, Order.size()))
4921 if (!TE.ReorderIndices.empty())
4922 return TE.ReorderIndices;
4925 std::iota(Phis.begin(), Phis.end(), 0);
4927 for (
unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4930 for (
unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4931 ResOrder[Id] = PhiToId[Phis[Id]];
4932 if (IsIdentityOrder(ResOrder))
4933 return std::nullopt;
4934 return std::move(ResOrder);
4936 if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
4940 if ((TE.getOpcode() == Instruction::ExtractElement ||
4941 (
all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
4942 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
4944 auto *EE = dyn_cast<ExtractElementInst>(V);
4945 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4950 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4952 if (Reuse || !CurrentOrder.
empty())
4953 return std::move(CurrentOrder);
4961 int Sz = TE.Scalars.size();
4963 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
4965 find_if(TE.Scalars, [](
Value *V) { return !isConstant(V); });
4966 if (It == TE.Scalars.begin())
4969 if (It != TE.Scalars.end()) {
4971 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4986 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4989 return std::move(Order);
4994 return std::nullopt;
4995 if (TE.Scalars.size() >= 4)
4999 return CurrentOrder;
5001 return std::nullopt;
5011 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
5013 if (Cluster != FirstCluster)
5019void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const {
5022 const unsigned Sz =
TE.Scalars.size();
5024 if (
TE.State != TreeEntry::NeedToGather ||
5031 addMask(NewMask,
TE.ReuseShuffleIndices);
5033 TE.ReorderIndices.clear();
5040 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
5041 *
End =
TE.ReuseShuffleIndices.end();
5042 It !=
End; std::advance(It, Sz))
5043 std::iota(It, std::next(It, Sz), 0);
5049 "Expected same size of orders");
5050 unsigned Sz = Order.
size();
5052 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
5053 if (Order[
Idx] != Sz)
5054 UsedIndices.
set(Order[
Idx]);
5056 if (SecondaryOrder.
empty()) {
5057 for (
unsigned Idx : seq<unsigned>(0, Sz))
5058 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
5061 for (
unsigned Idx : seq<unsigned>(0, Sz))
5062 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
5063 !UsedIndices.
test(SecondaryOrder[
Idx]))
5064 Order[
Idx] = SecondaryOrder[
Idx];
5084 ExternalUserReorderMap;
5089 const std::unique_ptr<TreeEntry> &TE) {
5092 findExternalStoreUsersReorderIndices(TE.get());
5093 if (!ExternalUserReorderIndices.
empty()) {
5094 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5096 std::move(ExternalUserReorderIndices));
5102 if (TE->isAltShuffle()) {
5105 unsigned Opcode0 = TE->getOpcode();
5106 unsigned Opcode1 = TE->getAltOpcode();
5109 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5110 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5116 if (std::optional<OrdersType> CurrentOrder =
5126 const TreeEntry *UserTE = TE.get();
5128 if (UserTE->UserTreeIndices.size() != 1)
5131 return EI.UserTE->State == TreeEntry::Vectorize &&
5132 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5135 UserTE = UserTE->UserTreeIndices.back().UserTE;
5138 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5139 if (!(TE->State == TreeEntry::Vectorize ||
5140 TE->State == TreeEntry::StridedVectorize) ||
5141 !TE->ReuseShuffleIndices.empty())
5142 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
5143 if (TE->State == TreeEntry::Vectorize &&
5144 TE->getOpcode() == Instruction::PHI)
5145 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
5150 for (
unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5152 auto It = VFToOrderedEntries.
find(VF);
5153 if (It == VFToOrderedEntries.
end())
5165 for (
const TreeEntry *OpTE : OrderedEntries) {
5168 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
5171 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5173 if (OpTE->State == TreeEntry::NeedToGather ||
5174 !OpTE->ReuseShuffleIndices.empty()) {
5175 auto It = GathersToOrders.find(OpTE);
5176 if (It != GathersToOrders.end())
5179 if (OpTE->isAltShuffle()) {
5180 auto It = AltShufflesToOrders.find(OpTE);
5181 if (It != AltShufflesToOrders.end())
5184 if (OpTE->State == TreeEntry::Vectorize &&
5185 OpTE->getOpcode() == Instruction::PHI) {
5186 auto It = PhisToOrders.
find(OpTE);
5187 if (It != PhisToOrders.
end())
5190 return OpTE->ReorderIndices;
5193 auto It = ExternalUserReorderMap.
find(OpTE);
5194 if (It != ExternalUserReorderMap.
end()) {
5195 const auto &ExternalUserReorderIndices = It->second;
5199 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5200 OrdersUses.insert(std::make_pair(
OrdersType(), 0)).first->second +=
5201 ExternalUserReorderIndices.size();
5203 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
5204 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5211 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5212 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5215 unsigned E = Order.size();
5218 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5221 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5223 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5226 if (OrdersUses.empty())
5229 const unsigned Sz = Order.size();
5230 for (
unsigned Idx : seq<unsigned>(0, Sz))
5231 if (
Idx != Order[
Idx] && Order[
Idx] != Sz)
5236 unsigned IdentityCnt = 0;
5237 unsigned FilledIdentityCnt = 0;
5239 for (
auto &Pair : OrdersUses) {
5240 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5241 if (!Pair.first.empty())
5242 FilledIdentityCnt += Pair.second;
5243 IdentityCnt += Pair.second;
5248 unsigned Cnt = IdentityCnt;
5249 for (
auto &Pair : OrdersUses) {
5253 if (Cnt < Pair.second ||
5254 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5255 Cnt == Pair.second && !BestOrder.
empty() &&
5256 IsIdentityOrder(BestOrder))) {
5258 BestOrder = Pair.first;
5265 if (IsIdentityOrder(BestOrder))
5271 unsigned E = BestOrder.
size();
5273 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5276 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5278 if (TE->Scalars.size() != VF) {
5279 if (TE->ReuseShuffleIndices.size() == VF) {
5285 return EI.UserTE->Scalars.size() == VF ||
5286 EI.UserTE->Scalars.size() ==
5289 "All users must be of VF size.");
5292 reorderNodeWithReuses(*TE, Mask);
5296 if ((TE->State == TreeEntry::Vectorize ||
5297 TE->State == TreeEntry::StridedVectorize) &&
5300 !TE->isAltShuffle()) {
5304 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5305 TE->reorderOperands(Mask);
5308 TE->reorderOperands(Mask);
5309 assert(TE->ReorderIndices.empty() &&
5310 "Expected empty reorder sequence.");
5313 if (!TE->ReuseShuffleIndices.empty()) {
5320 addMask(NewReuses, TE->ReuseShuffleIndices);
5321 TE->ReuseShuffleIndices.swap(NewReuses);
5327bool BoUpSLP::canReorderOperands(
5328 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5332 if (UserTE->isNonPowOf2Vec())
5335 for (
unsigned I = 0,
E = UserTE->getNumOperands();
I <
E; ++
I) {
5336 if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
5337 return OpData.first ==
I &&
5338 (OpData.second->State == TreeEntry::Vectorize ||
5339 OpData.second->State == TreeEntry::StridedVectorize);
5342 if (TreeEntry *TE = getVectorizedOperand(UserTE,
I)) {
5344 if (
any_of(TE->UserTreeIndices,
5345 [UserTE](
const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5349 Edges.emplace_back(
I, TE);
5355 if (TE->State != TreeEntry::Vectorize &&
5356 TE->State != TreeEntry::StridedVectorize &&
5357 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5361 TreeEntry *
Gather =
nullptr;
5363 [&
Gather, UserTE,
I](TreeEntry *TE) {
5364 assert(TE->State != TreeEntry::Vectorize &&
5365 TE->State != TreeEntry::StridedVectorize &&
5366 "Only non-vectorized nodes are expected.");
5367 if (
any_of(TE->UserTreeIndices,
5368 [UserTE,
I](
const EdgeInfo &EI) {
5369 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5371 assert(TE->isSame(UserTE->getOperand(
I)) &&
5372 "Operand entry does not match operands.");
5393 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5394 if (TE->State != TreeEntry::Vectorize &&
5395 TE->State != TreeEntry::StridedVectorize)
5397 if (std::optional<OrdersType> CurrentOrder =
5399 OrderedEntries.
insert(TE.get());
5400 if (!(TE->State == TreeEntry::Vectorize ||
5401 TE->State == TreeEntry::StridedVectorize) ||
5402 !TE->ReuseShuffleIndices.empty())
5403 GathersToOrders.
insert(TE.get());
5412 while (!OrderedEntries.
empty()) {
5417 for (TreeEntry *TE : OrderedEntries) {
5418 if (!(TE->State == TreeEntry::Vectorize ||
5419 TE->State == TreeEntry::StridedVectorize ||
5420 (TE->State == TreeEntry::NeedToGather &&
5422 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5425 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5427 !Visited.
insert(TE).second) {
5433 for (
EdgeInfo &EI : TE->UserTreeIndices) {
5434 TreeEntry *UserTE = EI.
UserTE;
5435 auto It =
Users.find(UserTE);
5436 if (It ==
Users.end())
5437 It =
Users.insert({UserTE, {}}).first;
5438 It->second.emplace_back(EI.
EdgeIdx, TE);
5442 for (TreeEntry *TE : Filtered)
5443 OrderedEntries.remove(TE);
5445 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5447 sort(UsersVec, [](
const auto &Data1,
const auto &Data2) {
5448 return Data1.first->Idx > Data2.first->Idx;
5450 for (
auto &
Data : UsersVec) {
5453 if (!canReorderOperands(
Data.first,
Data.second, NonVectorized,
5455 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5456 OrderedEntries.remove(
Op.second);
5469 for (
const auto &
Op :
Data.second) {
5470 TreeEntry *OpTE =
Op.second;
5471 if (!VisitedOps.
insert(OpTE).second)
5473 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
5475 const auto Order = [&]() ->
const OrdersType {
5476 if (OpTE->State == TreeEntry::NeedToGather ||
5477 !OpTE->ReuseShuffleIndices.empty())
5480 return OpTE->ReorderIndices;
5484 if (Order.size() == 1)
5487 Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
5488 return P.second == OpTE;
5491 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5492 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5495 unsigned E = Order.size();
5498 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5501 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5504 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5506 auto Res = OrdersUses.insert(std::make_pair(
OrdersType(), 0));
5507 const auto AllowsReordering = [&](
const TreeEntry *TE) {
5509 if (TE->isNonPowOf2Vec())
5511 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5512 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5513 (IgnoreReorder && TE->Idx == 0))
5515 if (TE->State == TreeEntry::NeedToGather) {
5524 for (
const EdgeInfo &EI : OpTE->UserTreeIndices) {
5525 TreeEntry *UserTE = EI.
UserTE;
5526 if (!VisitedUsers.
insert(UserTE).second)
5531 if (AllowsReordering(UserTE))
5539 if (
static_cast<unsigned>(
count_if(
5540 Ops, [UserTE, &AllowsReordering](
5541 const std::pair<unsigned, TreeEntry *> &
Op) {
5542 return AllowsReordering(
Op.second) &&
5545 return EI.UserTE == UserTE;
5547 })) <= Ops.
size() / 2)
5548 ++Res.first->second;
5551 if (OrdersUses.empty()) {
5552 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5553 OrderedEntries.remove(
Op.second);
5557 const unsigned Sz = Order.size();
5558 for (
unsigned Idx : seq<unsigned>(0, Sz))
5559 if (
Idx != Order[
Idx] && Order[
Idx] != Sz)
5564 unsigned IdentityCnt = 0;
5565 unsigned VF =
Data.second.front().second->getVectorFactor();
5567 for (
auto &Pair : OrdersUses) {
5568 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5569 IdentityCnt += Pair.second;
5574 unsigned Cnt = IdentityCnt;
5575 for (
auto &Pair : OrdersUses) {
5579 if (Cnt < Pair.second) {
5581 BestOrder = Pair.first;
5588 if (IsIdentityOrder(BestOrder)) {
5589 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5590 OrderedEntries.remove(
Op.second);
5599 unsigned E = BestOrder.
size();
5601 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5603 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
5604 TreeEntry *TE =
Op.second;
5605 OrderedEntries.remove(TE);
5606 if (!VisitedOps.
insert(TE).second)
5608 if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
5609 reorderNodeWithReuses(*TE, Mask);
5613 if (TE->State != TreeEntry::Vectorize &&
5614 TE->State != TreeEntry::StridedVectorize &&
5615 (TE->State != TreeEntry::ScatterVectorize ||
5616 TE->ReorderIndices.empty()))
5618 assert((BestOrder.
size() == TE->ReorderIndices.size() ||
5619 TE->ReorderIndices.empty()) &&
5620 "Non-matching sizes of user/operand entries.");
5622 if (IgnoreReorder && TE == VectorizableTree.front().get())
5623 IgnoreReorder =
false;
5626 for (TreeEntry *
Gather : GatherOps) {
5628 "Unexpected reordering of gathers.");
5629 if (!
Gather->ReuseShuffleIndices.empty()) {
5635 OrderedEntries.remove(
Gather);
5639 if (
Data.first->State != TreeEntry::Vectorize ||
5640 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5641 Data.first->getMainOp()) ||
5642 Data.first->isAltShuffle())
5643 Data.first->reorderOperands(Mask);
5644 if (!isa<InsertElementInst, StoreInst>(
Data.first->getMainOp()) ||
5645 Data.first->isAltShuffle() ||
5646 Data.first->State == TreeEntry::StridedVectorize) {
5650 if (
Data.first->ReuseShuffleIndices.empty() &&
5651 !
Data.first->ReorderIndices.empty() &&
5652 !
Data.first->isAltShuffle()) {
5655 OrderedEntries.insert(
Data.first);
5663 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5664 VectorizableTree.front()->ReuseShuffleIndices.empty())
5665 VectorizableTree.front()->ReorderIndices.clear();
5672 for (
auto &TEPtr : VectorizableTree) {
5673 TreeEntry *Entry = TEPtr.get();
5676 if (Entry->State == TreeEntry::NeedToGather)
5680 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5681 Value *Scalar = Entry->Scalars[Lane];
5682 if (!isa<Instruction>(Scalar))
5685 auto It = ScalarToExtUses.
find(Scalar);
5686 if (It != ScalarToExtUses.
end() && !ExternalUses[It->second].User)
5690 const auto *ExtI = ExternallyUsedValues.
find(Scalar);
5691 if (ExtI != ExternallyUsedValues.
end()) {
5692 int FoundLane = Entry->findLaneForValue(Scalar);
5693 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract: Extra arg from lane "
5694 << FoundLane <<
" from " << *Scalar <<
".\n");
5695 ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size());
5696 ExternalUses.emplace_back(Scalar,
nullptr, FoundLane);
5699 for (
User *U : Scalar->users()) {
5707 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5711 if (TreeEntry *UseEntry = getTreeEntry(U)) {
5715 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5717 Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
5718 LLVM_DEBUG(
dbgs() <<
"SLP: \tInternal user will be removed:" << *U
5720 assert(UseEntry->State != TreeEntry::NeedToGather &&
"Bad state");
5724 if (It != ScalarToExtUses.
end()) {
5725 ExternalUses[It->second].User =
nullptr;
5730 int FoundLane = Entry->findLaneForValue(Scalar);
5732 <<
" from lane " << FoundLane <<
" from " << *Scalar
5734 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
5735 ExternalUses.emplace_back(Scalar, U, FoundLane);
5744BoUpSLP::collectUserStores(
const BoUpSLP::TreeEntry *TE)
const {
5746 for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5747 Value *V = TE->Scalars[Lane];
5753 for (
User *U : V->users()) {
5754 auto *SI = dyn_cast<StoreInst>(U);
5755 if (SI ==
nullptr || !SI->isSimple() ||
5759 if (getTreeEntry(U))
5763 auto &StoresVec = PtrToStoresMap[
Ptr];
5766 if (StoresVec.size() > Lane)
5769 if (!StoresVec.empty() &&
5770 SI->getParent() != StoresVec.back()->getParent())
5773 if (!StoresVec.empty() &&
5774 SI->getValueOperand()->getType() !=
5775 StoresVec.back()->getValueOperand()->getType())
5777 StoresVec.push_back(SI);
5780 return PtrToStoresMap;
5784 OrdersType &ReorderIndices)
const {
5792 StoreOffsetVec[0] = {S0, 0};
5795 for (
unsigned Idx : seq<unsigned>(1, StoresVec.
size())) {
5797 std::optional<int> Diff =
5799 SI->getPointerOperand(), *
DL, *SE,
5804 StoreOffsetVec[
Idx] = {StoresVec[
Idx], *Diff};
5809 stable_sort(StoreOffsetVec, [](
const std::pair<StoreInst *, int> &Pair1,
5810 const std::pair<StoreInst *, int> &Pair2) {
5811 int Offset1 = Pair1.second;
5812 int Offset2 = Pair2.second;
5813 return Offset1 < Offset2;
5817 for (
unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5818 if (StoreOffsetVec[
Idx].second != StoreOffsetVec[
Idx - 1].second + 1)
5823 ReorderIndices.reserve(StoresVec.
size());
5826 [SI](
const std::pair<StoreInst *, int> &Pair) {
5827 return Pair.first ==
SI;
5829 StoreOffsetVec.begin();
5830 ReorderIndices.push_back(
Idx);
5835 auto IsIdentityOrder = [](
const OrdersType &Order) {
5836 for (
unsigned Idx : seq<unsigned>(0, Order.size()))
5841 if (IsIdentityOrder(ReorderIndices))
5842 ReorderIndices.clear();
5849 for (
unsigned Idx : Order)
5856BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE)
const {
5857 unsigned NumLanes =
TE->Scalars.size();
5860 collectUserStores(TE);
5869 for (
const auto &Pair : PtrToStoresMap) {
5870 auto &StoresVec = Pair.second;
5872 if (StoresVec.size() != NumLanes)
5877 if (!canFormVector(StoresVec, ReorderIndices))
5882 ExternalReorderIndices.
push_back(ReorderIndices);
5884 return ExternalReorderIndices;
5890 UserIgnoreList = &UserIgnoreLst;
5893 buildTree_rec(Roots, 0,
EdgeInfo());
5900 buildTree_rec(Roots, 0,
EdgeInfo());
5907 Value *NeedsScheduling =
nullptr;
5908 for (
Value *V : VL) {
5911 if (!NeedsScheduling) {
5912 NeedsScheduling = V;
5917 return NeedsScheduling;
5928 bool AllowAlternate) {
5932 if (
auto *LI = dyn_cast<LoadInst>(V)) {
5935 SubKey =
hash_value(LoadsSubkeyGenerator(Key, LI));
5940 if (isa<ExtractElementInst, UndefValue>(V))
5942 if (
auto *EI = dyn_cast<ExtractElementInst>(V)) {
5944 !isa<UndefValue>(EI->getIndexOperand()))
5947 }
else if (
auto *
I = dyn_cast<Instruction>(V)) {
5950 if ((isa<BinaryOperator, CastInst>(
I)) &&
5960 : cast<CastInst>(
I)->getOperand(0)->getType()));
5962 if (isa<CastInst>(
I)) {
5963 std::pair<size_t, size_t> OpVals =
5969 }
else if (
auto *CI = dyn_cast<CmpInst>(
I)) {
5971 if (CI->isCommutative())
5977 }
else if (
auto *Call = dyn_cast<CallInst>(
I)) {
5991 }
else if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
5992 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
5993 SubKey =
hash_value(Gep->getPointerOperand());
5997 !isa<ConstantInt>(
I->getOperand(1))) {
6005 return std::make_pair(Key, SubKey);
6015bool BoUpSLP::areAltOperandsProfitable(
const InstructionsState &S,
6017 unsigned Opcode0 = S.getOpcode();
6018 unsigned Opcode1 = S.getAltOpcode();
6022 Opcode0, Opcode1, OpcodeMask))
6025 for (
unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
6029 Operands.back().push_back(cast<Instruction>(V)->getOperand(
I));
6033 for (
unsigned I : seq<unsigned>(0, VL.size() - 1)) {
6039 switch (Res.value_or(0)) {
6054 constexpr unsigned NumAltInsts = 3;
6055 unsigned NonInstCnt = 0;
6058 unsigned UndefCnt = 0;
6060 unsigned ExtraShuffleInsts = 0;
6069 return is_contained(Operands.back(), V);
6072 ++ExtraShuffleInsts;
6089 if (isa<Constant, ExtractElementInst>(V) ||
6090 getTreeEntry(V) || (L &&
L->isLoopInvariant(V))) {
6091 if (isa<UndefValue>(V))
6097 if (!Res.second && Res.first->second == 1)
6098 ++ExtraShuffleInsts;
6099 ++Res.first->getSecond();
6100 if (
auto *
I = dyn_cast<Instruction>(V))
6101 UniqueOpcodes.
insert(
I->getOpcode());
6102 else if (Res.second)
6105 return none_of(Uniques, [&](
const auto &
P) {
6106 return P.first->hasNUsesOrMore(
P.second + 1) &&
6108 return getTreeEntry(U) || Uniques.contains(U);
6117 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
6118 (UniqueOpcodes.
size() + NonInstCnt + ExtraShuffleInsts +
6119 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
6122BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
6125 assert(S.MainOp &&
"Expected instructions with same/alternate opcodes only.");
6127 unsigned ShuffleOrOp =
6128 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
6129 auto *VL0 = cast<Instruction>(S.OpValue);
6130 switch (ShuffleOrOp) {
6131 case Instruction::PHI: {
6134 return TreeEntry::NeedToGather;
6137 for (
Value *
Incoming : cast<PHINode>(V)->incoming_values()) {
6139 if (Term &&
Term->isTerminator()) {
6141 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
6142 return TreeEntry::NeedToGather;
6146 return TreeEntry::Vectorize;
6148 case Instruction::ExtractValue:
6149 case Instruction::ExtractElement: {
6150 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6153 return TreeEntry::NeedToGather;
6154 if (Reuse || !CurrentOrder.empty())
6155 return TreeEntry::Vectorize;
6157 return TreeEntry::NeedToGather;
6159 case Instruction::InsertElement: {
6163 for (
Value *V : VL) {
6164 SourceVectors.
insert(cast<Instruction>(V)->getOperand(0));
6166 "Non-constant or undef index?");
6170 return !SourceVectors.contains(V);
6173 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
6174 "different source vectors.\n");
6175 return TreeEntry::NeedToGather;
6178 return TreeEntry::Vectorize;
6180 case Instruction::Load: {
6189 return TreeEntry::Vectorize;
6191 return TreeEntry::ScatterVectorize;
6193 return TreeEntry::StridedVectorize;
6196 Type *ScalarTy = VL0->getType();
6197 if (
DL->getTypeSizeInBits(ScalarTy) !=
6198 DL->getTypeAllocSizeInBits(ScalarTy))
6199 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
6201 [](
Value *V) {
return !cast<LoadInst>(V)->isSimple(); }))
6206 return TreeEntry::NeedToGather;
6210 case Instruction::ZExt:
6211 case Instruction::SExt:
6212 case Instruction::FPToUI:
6213 case Instruction::FPToSI:
6214 case Instruction::FPExt:
6215 case Instruction::PtrToInt:
6216 case Instruction::IntToPtr:
6217 case Instruction::SIToFP:
6218 case Instruction::UIToFP:
6219 case Instruction::Trunc:
6220 case Instruction::FPTrunc:
6221 case Instruction::BitCast: {
6222 Type *SrcTy = VL0->getOperand(0)->getType();
6223 for (
Value *V : VL) {
6224 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
6227 dbgs() <<
"SLP: Gathering casts with different src types.\n");
6228 return TreeEntry::NeedToGather;
6231 return TreeEntry::Vectorize;
6233 case Instruction::ICmp:
6234 case Instruction::FCmp: {
6238 Type *ComparedTy = VL0->getOperand(0)->getType();
6239 for (
Value *V : VL) {
6241 if ((
Cmp->getPredicate() != P0 &&
Cmp->getPredicate() != SwapP0) ||
6242 Cmp->getOperand(0)->getType() != ComparedTy) {
6243 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering cmp with different predicate.\n");
6244 return TreeEntry::NeedToGather;
6247 return TreeEntry::Vectorize;
6249 case Instruction::Select:
6250 case Instruction::FNeg:
6251 case Instruction::Add:
6252 case Instruction::FAdd:
6253 case Instruction::Sub:
6254 case Instruction::FSub:
6255 case Instruction::Mul:
6256 case Instruction::FMul:
6257 case Instruction::UDiv:
6258 case Instruction::SDiv:
6259 case Instruction::FDiv:
6260 case Instruction::URem:
6261 case Instruction::SRem:
6262 case Instruction::FRem:
6263 case Instruction::Shl:
6264 case Instruction::LShr:
6265 case Instruction::AShr:
6266 case Instruction::And:
6267 case Instruction::Or:
6268 case Instruction::Xor:
6269 return TreeEntry::Vectorize;
6270 case Instruction::GetElementPtr: {
6272 for (
Value *V : VL) {
6273 auto *
I = dyn_cast<GetElementPtrInst>(V);
6276 if (
I->getNumOperands() != 2) {
6277 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
6278 return TreeEntry::NeedToGather;
6284 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
6285 for (
Value *V : VL) {
6286 auto *
GEP = dyn_cast<GEPOperator>(V);
6289 Type *CurTy =
GEP->getSourceElementType();
6291 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
6292 return TreeEntry::NeedToGather;
6297 Type *Ty1 = VL0->getOperand(1)->getType();
6298 for (
Value *V : VL) {
6299 auto *
I = dyn_cast<GetElementPtrInst>(V);
6302 auto *
Op =
I->getOperand(1);
6303 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
6304 (
Op->getType() != Ty1 &&
6305 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
6306 Op->getType()->getScalarSizeInBits() >
6307 DL->getIndexSizeInBits(
6308 V->getType()->getPointerAddressSpace())))) {
6310 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
6311 return TreeEntry::NeedToGather;
6315 return TreeEntry::Vectorize;
6317 case Instruction::Store: {
6319 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
6322 if (
DL->getTypeSizeInBits(ScalarTy) !=
6323 DL->getTypeAllocSizeInBits(ScalarTy)) {
6324 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering stores of non-packed type.\n");
6325 return TreeEntry::NeedToGather;
6329 for (
Value *V : VL) {
6330 auto *
SI = cast<StoreInst>(V);
6331 if (!
SI->isSimple()) {
6333 return TreeEntry::NeedToGather;
6342 if (CurrentOrder.empty()) {
6343 Ptr0 = PointerOps.
front();
6344 PtrN = PointerOps.
back();
6346 Ptr0 = PointerOps[CurrentOrder.front()];
6347 PtrN = PointerOps[CurrentOrder.back()];
6349 std::optional<int> Dist =
6352 if (
static_cast<unsigned>(*Dist) == VL.size() - 1)
6353 return TreeEntry::Vectorize;
6357 return TreeEntry::NeedToGather;
6359 case Instruction::Call: {
6362 CallInst *CI = cast<CallInst>(VL0);
6373 return TreeEntry::NeedToGather;
6378 for (
unsigned J = 0; J != NumArgs; ++J)
6381 for (
Value *V : VL) {
6382 CallInst *CI2 = dyn_cast<CallInst>(V);
6388 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched calls:" << *CI <<
"!=" << *V
6390 return TreeEntry::NeedToGather;
6394 for (
unsigned J = 0; J != NumArgs; ++J) {
6397 if (ScalarArgs[J] != A1J) {
6399 <<
"SLP: mismatched arguments in call:" << *CI
6400 <<
" argument " << ScalarArgs[J] <<
"!=" << A1J <<
"\n");
6401 return TreeEntry::NeedToGather;
6410 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched bundle operands in calls:" << *CI
6411 <<
"!=" << *V <<
'\n');
6412 return TreeEntry::NeedToGather;
6416 return TreeEntry::Vectorize;
6418 case Instruction::ShuffleVector: {
6421 if (!S.isAltShuffle()) {
6422 LLVM_DEBUG(
dbgs() <<
"SLP: ShuffleVector are not vectorized.\n");
6423 return TreeEntry::NeedToGather;
6428 <<
"SLP: ShuffleVector not vectorized, operands are buildvector and "
6429 "the whole alt sequence is not profitable.\n");
6430 return TreeEntry::NeedToGather;
6433 return TreeEntry::Vectorize;
6437 return TreeEntry::NeedToGather;
6451 PHIHandler() =
delete;
6453 : DT(DT), Main(Main), Phis(Phis),
6454 Operands(Main->getNumIncomingValues(),
6456 void buildOperands() {
6457 constexpr unsigned FastLimit = 4;
6467 auto *
P = cast<PHINode>(V);
6468 if (
P->getIncomingBlock(
I) == InBB)
6483 Blocks.try_emplace(InBB).first->second.push_back(
I);
6486 auto *
P = cast<PHINode>(V);
6487 for (
unsigned I : seq<unsigned>(0,
P->getNumIncomingValues())) {
6495 auto It =
Blocks.find(InBB);
6501 for (
const auto &
P :
Blocks) {
6502 if (
P.getSecond().size() <= 1)
6504 unsigned BasicI =
P.getSecond().front();
6507 [&](
const auto &Data) {
6508 return !Data.value() ||
6509 Data.value() ==
Operands[BasicI][Data.index()];
6511 "Expected empty operands list.");
6521 const EdgeInfo &UserTreeIdx) {
6527 auto TryToFindDuplicates = [&](
const InstructionsState &S,
6528 bool DoNotFail =
false) {
6531 for (
Value *V : VL) {
6538 auto Res = UniquePositions.try_emplace(V, UniqueValues.
size());
6543 size_t NumUniqueScalarValues = UniqueValues.
size();
6544 if (NumUniqueScalarValues == VL.size()) {
6545 ReuseShuffleIndices.
clear();
6548 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6549 LLVM_DEBUG(
dbgs() <<
"SLP: Reshuffling scalars not yet supported "
6550 "for nodes with padding.\n");
6551 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6555 if (NumUniqueScalarValues <= 1 ||
6556 (UniquePositions.size() == 1 &&
all_of(UniqueValues,
6558 return isa<UndefValue>(V) ||
6561 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6562 if (DoNotFail && UniquePositions.size() > 1 &&
6563 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6565 return isa<ExtractElementInst>(V) ||
6566 areAllUsersVectorized(cast<Instruction>(V),
6570 if (PWSz == VL.size()) {
6571 ReuseShuffleIndices.
clear();
6573 NonUniqueValueVL.
assign(UniqueValues.
begin(), UniqueValues.
end());
6574 NonUniqueValueVL.
append(PWSz - UniqueValues.
size(),
6575 UniqueValues.
back());
6576 VL = NonUniqueValueVL;
6581 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6592 if (!EphValues.
empty()) {
6593 for (
Value *V : VL) {
6594 if (EphValues.
count(V)) {
6596 <<
") is ephemeral.\n");
6597 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6607 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6612 cast<Instruction>(
I)->getOpcode() ==
6613 cast<Instruction>(S.MainOp)->getOpcode();
6615 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to max recursion depth.\n");
6616 if (TryToFindDuplicates(S))
6617 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6618 ReuseShuffleIndices);
6623 if (S.getOpcode() == Instruction::ExtractElement &&
6624 isa<ScalableVectorType>(
6625 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6626 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to scalable vector type.\n");
6627 if (TryToFindDuplicates(S))
6628 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6629 ReuseShuffleIndices);
6634 if (S.OpValue->getType()->isVectorTy() &&
6635 !isa<InsertElementInst>(S.OpValue)) {
6637 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6641 if (
StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6642 if (
SI->getValueOperand()->getType()->isVectorTy()) {
6643 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to store vector type.\n");
6644 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6653 auto &&NotProfitableForVectorization = [&S,
this,
6655 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6664 for (
Value *V : VL) {
6665 auto *
I = cast<Instruction>(V);
6667 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6671 if ((IsCommutative &&
6672 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6674 all_of(InstsCount, [](
unsigned ICnt) {
return ICnt < 2; })))
6676 assert(VL.size() == 2 &&
"Expected only 2 alternate op instructions.");
6678 auto *
I1 = cast<Instruction>(VL.front());
6679 auto *I2 = cast<Instruction>(VL.back());
6682 I2->getOperand(
Op));
6683 if (
static_cast<unsigned>(
count_if(
6684 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
6686 })) >= S.MainOp->getNumOperands() / 2)
6688 if (S.MainOp->getNumOperands() > 2)
6690 if (IsCommutative) {
6695 I2->getOperand((
Op + 1) % E));
6697 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
6706 bool IsScatterVectorizeUserTE =
6707 UserTreeIdx.UserTE &&
6708 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6709 bool AreAllSameInsts =
6711 (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
6715 auto *
I = dyn_cast<GetElementPtrInst>(V);
6719 BB =
I->getParent();
6720 return BB ==
I->getParent() &&
I->getNumOperands() == 2;
6723 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
6726 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6729 NotProfitableForVectorization(VL)) {
6730 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to C,S,B,O, small shuffle. \n");
6731 if (TryToFindDuplicates(S))
6732 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6733 ReuseShuffleIndices);
6741 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
6742 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *S.OpValue <<
".\n");
6743 if (!E->isSame(VL)) {
6744 auto It = MultiNodeScalars.
find(S.OpValue);
6745 if (It != MultiNodeScalars.
end()) {
6746 auto *TEIt =
find_if(It->getSecond(),
6747 [&](TreeEntry *ME) { return ME->isSame(VL); });
6748 if (TEIt != It->getSecond().end())
6758 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to partial overlap.\n");
6759 if (TryToFindDuplicates(S))
6760 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6761 ReuseShuffleIndices);
6767 E->UserTreeIndices.push_back(UserTreeIdx);
6768 LLVM_DEBUG(
dbgs() <<
"SLP: Perfect diamond merge at " << *S.OpValue
6775 for (
Value *V : VL) {
6776 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
6779 if (getTreeEntry(V)) {
6781 <<
") is already in tree.\n");
6782 if (TryToFindDuplicates(S))
6783 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6784 ReuseShuffleIndices);
6790 if (UserIgnoreList && !UserIgnoreList->empty()) {
6791 for (
Value *V : VL) {
6792 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6793 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to gathered scalar.\n");
6794 if (TryToFindDuplicates(S))
6795 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6796 ReuseShuffleIndices);
6804 if (AreAllSameInsts && UserTreeIdx.UserTE &&
6805 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
6807 assert(S.OpValue->getType()->isPointerTy() &&
6808 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6809 "Expected pointers only.");
6811 const auto *It =
find_if(VL, IsaPred<GetElementPtrInst>);
6812 assert(It != VL.end() &&
"Expected at least one GEP.");
6818 auto *VL0 = cast<Instruction>(S.OpValue);
6825 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6834 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6839 if (!TryToFindDuplicates(S,
true))
6845 TreeEntry::EntryState State = getScalarsVectorizationState(
6846 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
6847 if (State == TreeEntry::NeedToGather) {
6848 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6849 ReuseShuffleIndices);
6853 auto &BSRef = BlocksSchedules[BB];
6855 BSRef = std::make_unique<BlockScheduling>(BB);
6857 BlockScheduling &BS = *BSRef;
6859 std::optional<ScheduleData *> Bundle =
6860 BS.tryScheduleBundle(UniqueValues,
this, S);
6861#ifdef EXPENSIVE_CHECKS
6866 LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
6867 assert((!BS.getScheduleData(VL0) ||
6868 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
6869 "tryScheduleBundle should cancelScheduling on failure");
6870 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6871 ReuseShuffleIndices);
6872 NonScheduledFirst.insert(VL.front());
6875 LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
6877 unsigned ShuffleOrOp = S.isAltShuffle() ?
6878 (
unsigned) Instruction::ShuffleVector : S.getOpcode();
6879 switch (ShuffleOrOp) {
6880 case Instruction::PHI: {
6881 auto *PH = cast<PHINode>(VL0);
6884 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
6888 PHIHandler Handler(*DT, PH, VL);
6889 Handler.buildOperands();
6890 for (
unsigned I : seq<unsigned>(0, PH->getNumOperands()))
6891 TE->setOperand(
I, Handler.getOperands(
I));
6892 for (
unsigned I : seq<unsigned>(0, PH->getNumOperands()))
6893 buildTree_rec(Handler.getOperands(
I),
Depth + 1, {TE, I});
6896 case Instruction::ExtractValue:
6897 case Instruction::ExtractElement: {
6898 if (CurrentOrder.empty()) {
6899 LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
6902 dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence "
6904 for (
unsigned Idx : CurrentOrder)
6912 newTreeEntry(VL, Bundle , S, UserTreeIdx,
6913 ReuseShuffleIndices, CurrentOrder);
6917 Op0.
assign(VL.size(), VL0->getOperand(0));
6918 VectorizableTree.back()->setOperand(0, Op0);
6921 case Instruction::InsertElement: {
6922 assert(ReuseShuffleIndices.
empty() &&
"All inserts should be unique");
6924 auto OrdCompare = [](
const std::pair<int, int> &P1,
6925 const std::pair<int, int> &P2) {
6926 return P1.first > P2.first;
6929 decltype(OrdCompare)>
6930 Indices(OrdCompare);
6931 for (
int I = 0, E = VL.size();
I < E; ++
I) {
6933 Indices.emplace(
Idx,
I);
6935 OrdersType CurrentOrder(VL.size(), VL.size());
6936 bool IsIdentity =
true;
6937 for (
int I = 0, E = VL.size();
I < E; ++
I) {
6938 CurrentOrder[Indices.top().second] =
I;
6939 IsIdentity &= Indices.top().second ==
I;
6943 CurrentOrder.clear();
6944 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6945 std::nullopt, CurrentOrder);
6948 TE->setOperandsInOrder();
6949 buildTree_rec(
TE->getOperand(1),
Depth + 1, {TE, 1});
6952 case Instruction::Load: {
6959 TreeEntry *
TE =
nullptr;
6962 case TreeEntry::Vectorize:
6963 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6964 ReuseShuffleIndices, CurrentOrder);
6965 if (CurrentOrder.empty())
6969 TE->setOperandsInOrder();
6971 case TreeEntry::StridedVectorize:
6973 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6974 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
6975 TE->setOperandsInOrder();
6978 case TreeEntry::ScatterVectorize:
6980 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
6981 UserTreeIdx, ReuseShuffleIndices);
6982 TE->setOperandsInOrder();
6983 buildTree_rec(PointerOps,
Depth + 1, {
TE, 0});
6984 LLVM_DEBUG(
dbgs() <<
"SLP: added a vector of non-consecutive loads.\n");
6986 case TreeEntry::NeedToGather:
6991 case Instruction::ZExt:
6992 case Instruction::SExt:
6993 case Instruction::FPToUI:
6994 case Instruction::FPToSI:
6995 case Instruction::FPExt:
6996 case Instruction::PtrToInt:
6997 case Instruction::IntToPtr:
6998 case Instruction::SIToFP:
6999 case Instruction::UIToFP:
7000 case Instruction::Trunc:
7001 case Instruction::FPTrunc:
7002 case Instruction::BitCast: {
7003 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
7004 std::make_pair(std::numeric_limits<unsigned>::min(),
7005 std::numeric_limits<unsigned>::max()));
7006 if (ShuffleOrOp == Instruction::ZExt ||
7007 ShuffleOrOp == Instruction::SExt) {
7008 CastMaxMinBWSizes = std::make_pair(
7014 }
else if (ShuffleOrOp == Instruction::Trunc) {
7015 CastMaxMinBWSizes = std::make_pair(
7021 ExtraBitWidthNodes.
insert(VectorizableTree.size() + 1);
7022 }
else if (ShuffleOrOp == Instruction::SIToFP ||
7023 ShuffleOrOp == Instruction::UIToFP) {
7024 unsigned NumSignBits =
7026 if (
auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
7028 NumSignBits = std::max(NumSignBits,
Mask.countl_zero());
7030 if (NumSignBits * 2 >=
7032 ExtraBitWidthNodes.
insert(VectorizableTree.size() + 1);
7034 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7035 ReuseShuffleIndices);
7038 TE->setOperandsInOrder();
7039 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7040 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
7043 case Instruction::ICmp:
7044 case Instruction::FCmp: {
7047 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7048 ReuseShuffleIndices);
7056 "Commutative Predicate mismatch");
7057 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7060 for (
Value *V : VL) {
7061 auto *
Cmp = cast<CmpInst>(V);
7064 if (
Cmp->getPredicate() != P0)
7066 Left.push_back(LHS);
7067 Right.push_back(RHS);
7074 if (ShuffleOrOp == Instruction::ICmp) {
7075 unsigned NumSignBits0 =
7077 if (NumSignBits0 * 2 >=
7079 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
7080 unsigned NumSignBits1 =
7082 if (NumSignBits1 * 2 >=
7084 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 1)->
Idx);
7088 case Instruction::Select:
7089 case Instruction::FNeg:
7090 case Instruction::Add:
7091 case Instruction::FAdd:
7092 case Instruction::Sub:
7093 case Instruction::FSub:
7094 case Instruction::Mul:
7095 case Instruction::FMul:
7096 case Instruction::UDiv:
7097 case Instruction::SDiv:
7098 case Instruction::FDiv:
7099 case Instruction::URem:
7100 case Instruction::SRem:
7101 case Instruction::FRem:
7102 case Instruction::Shl:
7103 case Instruction::LShr:
7104 case Instruction::AShr:
7105 case Instruction::And:
7106 case Instruction::Or:
7107 case Instruction::Xor: {
7108 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7109 ReuseShuffleIndices);
7116 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7124 TE->setOperandsInOrder();
7125 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7126 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
7129 case Instruction::GetElementPtr: {
7130 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7131 ReuseShuffleIndices);
7135 for (
Value *V : VL) {
7136 auto *
GEP = dyn_cast<GetElementPtrInst>(V);
7141 Operands.front().push_back(
GEP->getPointerOperand());
7150 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
7152 [VL0Ty, IndexIdx](
Value *V) {
7153 auto *
GEP = dyn_cast<GetElementPtrInst>(V);
7156 return VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
7160 ->getPointerOperandType()
7163 for (
Value *V : VL) {
7164 auto *
I = dyn_cast<GetElementPtrInst>(V);
7167 ConstantInt::get(Ty, 0,
false));
7170 auto *
Op =
I->getOperand(IndexIdx);
7171 auto *CI = dyn_cast<ConstantInt>(
Op);
7176 CI, Ty, CI->getValue().isSignBitSet(), *DL));
7180 for (
unsigned I = 0, Ops =
Operands.size();
I < Ops; ++
I)
7184 case Instruction::Store: {
7185 bool Consecutive = CurrentOrder.empty();
7188 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7189 ReuseShuffleIndices, CurrentOrder);
7190 TE->setOperandsInOrder();
7191 buildTree_rec(
TE->getOperand(0),
Depth + 1, {TE, 0});
7195 LLVM_DEBUG(
dbgs() <<
"SLP: added a vector of jumbled stores.\n");
7198 case Instruction::Call: {
7201 CallInst *CI = cast<CallInst>(VL0);
7204 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7205 ReuseShuffleIndices);
7210 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7214 for (
unsigned I : seq<unsigned>(2, CI->
arg_size())) {
7218 for (
Value *V : VL) {
7219 auto *CI2 = cast<CallInst>(V);
7226 for (
unsigned I : seq<unsigned>(2, CI->
arg_size())) {
7233 TE->setOperandsInOrder();
7234 for (
unsigned I : seq<unsigned>(0, CI->
arg_size())) {
7241 for (
Value *V : VL) {
7242 auto *CI2 = cast<CallInst>(V);
7249 case Instruction::ShuffleVector: {
7250 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7251 ReuseShuffleIndices);
7255 auto *CI = dyn_cast<CmpInst>(VL0);
7256 if (isa<BinaryOperator>(VL0) || CI) {
7259 return cast<CmpInst>(V)->isCommutative();
7261 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7263 auto *MainCI = cast<CmpInst>(S.MainOp);
7264 auto *AltCI = cast<CmpInst>(S.AltOp);
7268 "Expected different main/alternate predicates.");
7271 for (
Value *V : VL) {
7272 auto *
Cmp = cast<CmpInst>(V);
7283 Left.push_back(LHS);
7284 Right.push_back(RHS);
7294 TE->setOperandsInOrder();
7295 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
7296 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
7309 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
7310 if (
auto *ST = dyn_cast<StructType>(EltTy)) {
7312 for (
const auto *Ty : ST->elements())
7313 if (Ty != *ST->element_begin())
7315 N *= ST->getNumElements();
7316 EltTy = *ST->element_begin();
7317 }
else if (
auto *AT = dyn_cast<ArrayType>(EltTy)) {
7318 N *= AT->getNumElements();
7319 EltTy = AT->getElementType();
7321 auto *VT = cast<FixedVectorType>(EltTy);
7322 N *= VT->getNumElements();
7323 EltTy = VT->getElementType();
7330 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7338 bool ResizeAllowed)
const {
7339 const auto *It =
find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
7340 assert(It != VL.
end() &&
"Expected at least one extract instruction.");
7341 auto *E0 = cast<Instruction>(*It);
7343 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7347 Value *Vec = E0->getOperand(0);
7349 CurrentOrder.
clear();
7353 if (E0->getOpcode() == Instruction::ExtractValue) {
7358 LoadInst *LI = dyn_cast<LoadInst>(Vec);
7362 NElts = cast<FixedVectorType>(Vec->
getType())->getNumElements();
7365 unsigned E = VL.
size();
7366 if (!ResizeAllowed && NElts != E)
7369 unsigned MinIdx = NElts, MaxIdx = 0;
7371 auto *Inst = dyn_cast<Instruction>(V);
7374 if (Inst->getOperand(0) != Vec)
7376 if (
auto *EE = dyn_cast<ExtractElementInst>(Inst))
7377 if (isa<UndefValue>(EE->getIndexOperand()))
7382 const unsigned ExtIdx = *
Idx;
7383 if (ExtIdx >= NElts)
7385 Indices[
I] = ExtIdx;
7386 if (MinIdx > ExtIdx)
7388 if (MaxIdx < ExtIdx)
7391 if (MaxIdx - MinIdx + 1 > E)
7393 if (MaxIdx + 1 <= E)
7397 bool ShouldKeepOrder =
true;
7403 CurrentOrder.
assign(E, E);
7404 for (
unsigned I = 0;
I < E; ++
I) {
7407 const unsigned ExtIdx = Indices[
I] - MinIdx;
7408 if (CurrentOrder[ExtIdx] != E) {
7409 CurrentOrder.
clear();
7412 ShouldKeepOrder &= ExtIdx ==
I;
7413 CurrentOrder[ExtIdx] =
I;
7415 if (ShouldKeepOrder)
7416 CurrentOrder.
clear();
7418 return ShouldKeepOrder;
7421bool BoUpSLP::areAllUsersVectorized(
7423 return (
I->hasOneUse() && (!VectorizedVals || VectorizedVals->
contains(
I))) ||
7425 return ScalarToTreeEntry.contains(U) ||
7426 isVectorLikeInstWithConstOps(U) ||
7427 (isa<ExtractElementInst>(U) && MustGather.contains(U));
7431static std::pair<InstructionCost, InstructionCost>
7439 if (
auto *FPCI = dyn_cast<FPMathOperator>(CI))
7440 FMF = FPCI->getFastMathFlags();
7443 dyn_cast<IntrinsicInst>(CI));
7444 auto IntrinsicCost =
7451 auto LibCost = IntrinsicCost;
7458 return {IntrinsicCost, LibCost};
7461void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7465 unsigned Sz = Scalars.size();
7468 if (!ReorderIndices.empty())
7470 for (
unsigned I = 0;
I < Sz; ++
I) {
7472 if (!ReorderIndices.empty())
7474 auto *OpInst = cast<Instruction>(Scalars[
Idx]);
7475 if (IsAltOp(OpInst)) {
7485 if (!ReuseShuffleIndices.
empty()) {
7488 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7498 if (
auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7499 auto *AltCI = cast<CmpInst>(AltOp);
7502 assert(MainP != AltP &&
"Expected different main/alternate predicates.");
7503 auto *CI = cast<CmpInst>(
I);
7511 assert((MainP ==
P || AltP ==
P || MainP == SwappedP || AltP == SwappedP) &&
7512 "CmpInst expected to match either main or alternate predicate or "
7515 return MainP !=
P && MainP != SwappedP;
7522 const auto *Op0 = Ops.
front();
7528 const bool IsUniform =
all_of(Ops, [=](
Value *V) {
7532 const bool IsPowerOfTwo =
all_of(Ops, [](
Value *V) {
7534 if (
auto *CI = dyn_cast<ConstantInt>(V))
7535 return CI->getValue().isPowerOf2();
7538 const bool IsNegatedPowerOfTwo =
all_of(Ops, [](
Value *V) {
7540 if (
auto *CI = dyn_cast<ConstantInt>(V))
7541 return CI->getValue().isNegatedPowerOf2();
7546 if (IsConstant && IsUniform)
7548 else if (IsConstant)
7562class BaseShuffleAnalysis {
7569 int Limit =
Mask.size();
7581 if (Limit % VF == 0 &&
all_of(seq<int>(0, Limit / VF), [=](
int Idx) {
7597 unsigned VF =
Mask.size();
7599 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
7602 int MaskedIdx =
Mask[ExtMask[
I] % VF];
7643 bool SinglePermute) {
7647 while (
auto *SV = dyn_cast<ShuffleVectorInst>(
Op)) {
7649 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7655 if (isIdentityMask(Mask, SVTy,
false)) {
7656 if (!IdentityOp || !SinglePermute ||
7657 (isIdentityMask(Mask, SVTy,
true) &&
7659 IdentityMask.
size()))) {
7664 IdentityMask.
assign(Mask);
7684 if (SV->isZeroEltSplat()) {
7686 IdentityMask.
assign(Mask);
7688 int LocalVF =
Mask.size();
7690 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7691 LocalVF = SVOpTy->getNumElements();
7695 static_cast<unsigned>(
I) >= SV->getShuffleMask().size())
7697 ExtMask[
Idx] = SV->getMaskValue(
I);
7707 if (!IsOp1Undef && !IsOp2Undef) {
7709 for (
int &
I : Mask) {
7712 if (SV->getMaskValue(
I % SV->getShuffleMask().size()) ==
7719 SV->getShuffleMask().end());
7720 combineMasks(LocalVF, ShuffleMask, Mask);
7721 Mask.swap(ShuffleMask);
7723 Op = SV->getOperand(0);
7725 Op = SV->getOperand(1);
7727 if (
auto *OpTy = dyn_cast<FixedVectorType>(
Op->getType());
7728 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
7733 "Expected masks of same sizes.");
7738 Mask.swap(IdentityMask);
7739 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
7740 return SinglePermute &&
7741 (isIdentityMask(Mask, cast<FixedVectorType>(
V->getType()),
7743 (Shuffle &&
Mask.size() == Shuffle->getShuffleMask().size() &&
7744 Shuffle->isZeroEltSplat() &&
7757 template <
typename T,
typename ShuffleBuilderTy>
7759 ShuffleBuilderTy &Builder) {
7760 assert(V1 &&
"Expected at least one vector value.");
7762 Builder.resizeToMatch(V1, V2);
7763 int VF =
Mask.size();
7764 if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
7765 VF = FTy->getNumElements();
7772 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
7775 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
7777 CombinedMask1[
I] =
Mask[
I];
7779 CombinedMask2[
I] =
Mask[
I] - VF;
7786 (void)peekThroughShuffles(Op1, CombinedMask1,
false);
7787 (void)peekThroughShuffles(Op2, CombinedMask2,
false);
7790 if (
auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
7791 if (
auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
7796 ExtMask1[
Idx] = SV1->getMaskValue(
I);
7799 cast<FixedVectorType>(SV1->getOperand(1)->getType())
7801 ExtMask1, UseMask::SecondArg);
7806 ExtMask2[
Idx] = SV2->getMaskValue(
I);
7809 cast<FixedVectorType>(SV2->getOperand(1)->getType())
7811 ExtMask2, UseMask::SecondArg);
7812 if (SV1->getOperand(0)->getType() ==
7813 SV2->getOperand(0)->getType() &&
7814 SV1->getOperand(0)->getType() != SV1->getType() &&
7817 Op1 = SV1->getOperand(0);
7818 Op2 = SV2->getOperand(0);
7820 SV1->getShuffleMask().end());
7821 int LocalVF = ShuffleMask1.size();
7822 if (
auto *FTy = dyn_cast<FixedVectorType>(Op1->
getType()))
7823 LocalVF = FTy->getNumElements();
7824 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
7825 CombinedMask1.swap(ShuffleMask1);
7827 SV2->getShuffleMask().end());
7828 LocalVF = ShuffleMask2.size();
7829 if (
auto *FTy = dyn_cast<FixedVectorType>(Op2->
getType()))
7830 LocalVF = FTy->getNumElements();
7831 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
7832 CombinedMask2.swap(ShuffleMask2);
7835 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
7836 Builder.resizeToMatch(Op1, Op2);
7837 VF = std::max(cast<VectorType>(Op1->
getType())
7839 .getKnownMinValue(),
7840 cast<VectorType>(Op2->
getType())
7842 .getKnownMinValue());
7843 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
7846 "Expected undefined mask element");
7847 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
7853 isa<ShuffleVectorInst>(Op1) &&
7854 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
7856 return Builder.createIdentity(Op1);
7857 return Builder.createShuffleVector(
7861 if (isa<PoisonValue>(V1))
7862 return Builder.createPoison(
7863 cast<VectorType>(V1->
getType())->getElementType(),
Mask.size());
7865 bool IsIdentity = peekThroughShuffles(V1, NewMask,
true);
7866 assert(V1 &&
"Expected non-null value after looking through shuffles.");
7869 return Builder.createShuffleVector(V1, NewMask);
7870 return Builder.createIdentity(V1);
7886 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7889 Mask, NumSrcElts, NumSubElts,
Index)) {
7890 if (
Index + NumSubElts > NumSrcElts &&
7891 Index + NumSrcElts <=
static_cast<int>(Mask.size()))
7901static std::pair<InstructionCost, InstructionCost>
7912 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
7922 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
7926 for (
Value *V : Ptrs) {
7931 auto *
Ptr = dyn_cast<GetElementPtrInst>(V);
7936 if (!
Ptr || !
Ptr->hasOneUse())
7940 if (PtrsRetainedInVecCode.
size() == Ptrs.size()) {
7946 TTI::PointersChainInfo::getKnownStride(),
7956 [](
const Value *V) {
7957 auto *
Ptr = dyn_cast<GetElementPtrInst>(V);
7958 return Ptr && !
Ptr->hasAllConstantIndices();
7960 ? TTI::PointersChainInfo::getUnknownStride()
7961 : TTI::PointersChainInfo::getKnownStride();
7965 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
7967 auto *It =
find_if(Ptrs, IsaPred<GEPOperator>);
7968 if (It != Ptrs.
end())
7969 BaseGEP = cast<GEPOperator>(*It);
7974 BaseGEP->getPointerOperand(), Indices, VecTy,
7979 return std::make_pair(ScalarCost, VecCost);
7984 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
7985 TreeEntry &E = *TE.get();
7986 switch (E.getOpcode()) {
7987 case Instruction::Load: {
7990 if (E.State != TreeEntry::Vectorize)
7992 Type *ScalarTy = E.getMainOp()->getType();
7994 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
8001 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
8008 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
8009 false, CommonAlignment,
CostKind, BaseLI);
8010 if (StridedCost < OriginalVecCost)
8013 E.State = TreeEntry::StridedVectorize;
8017 case Instruction::Store: {
8019 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
8021 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
8028 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
8035 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
8036 false, CommonAlignment,
CostKind, BaseSI);
8037 if (StridedCost < OriginalVecCost)
8040 E.State = TreeEntry::StridedVectorize;
8057 bool IsFinalized =
false;
8060 Type *ScalarTy =
nullptr;
8071 bool SameNodesEstimated =
true;
8080 if (
auto *VTy = dyn_cast<VectorType>(Ty))
8096 const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
8097 unsigned MinVF = R.getMinVF(2 * Sz);
8098 if (VL.
size() > 2 &&
8099 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
8100 (InVectors.
empty() &&
8103 ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
8104 InstructionsState S = getSameOpcode(SubVL, *R.TLI);
8105 return S.getOpcode() == Instruction::Load &&
8108 !
all_of(Gathers, [&](
Value *V) {
return R.getTreeEntry(V); }) &&
8114 unsigned StartIdx = 0;
8115 unsigned VF = VL.
size() / 2;
8116 for (; VF >= MinVF; VF /= 2) {
8117 for (
unsigned Cnt = StartIdx,
End = VL.
size(); Cnt + VF <=
End;
8120 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
8122 if (SliceS.getOpcode() != Instruction::Load ||
8123 SliceS.isAltShuffle())
8131 CurrentOrder, PointerOps);
8141 CurrentOrder.
empty()) ||
8150 if (Cnt == StartIdx)
8159 if (StartIdx >= VL.
size())
8162 if (!VectorizedLoads.
empty())
8165 if (!VectorizedLoads.
empty()) {
8167 bool NeedInsertSubvectorAnalysis =
8168 !NumParts || (VL.
size() / VF) > NumParts;
8174 getBuildVectorCost(VL.
slice(
I, std::min(
End -
I, VF)), Root);
8181 for (
Value *V : VectorizedLoads) {
8182 auto *LI = cast<LoadInst>(V);
8189 for (
const std::pair<unsigned, LoadsState> &
P : VectorizedStarts) {
8190 auto *LI = cast<LoadInst>(VL[
P.first]);
8199 false, Alignment, CostKind, LI);
8203 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
8204 auto [ScalarGEPCost, VectorGEPCost] =
8206 Instruction::Load, CostKind, LI->
getType(), LoadTy);
8207 GatherCost += VectorGEPCost - ScalarGEPCost;
8209 for (
unsigned P : ScatterVectorized) {
8210 auto *LI0 = cast<LoadInst>(VL[
P]);
8212 Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
8214 Instruction::Load, LoadTy, LI0->getPointerOperand(),
8215 false, CommonAlignment, CostKind, LI0);
8219 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
8227 auto [ScalarGEPCost, VectorGEPCost] =
8229 CostKind, ScalarTy, VecTy);
8230 GatherCost += VectorGEPCost - ScalarGEPCost;
8231 if (!Order.
empty()) {
8235 VecTy, Mask, CostKind);
8238 GatherCost += R.getGatherCost(PointerOps,
true,
8239 PointerOps.
front()->getType());
8242 if (NeedInsertSubvectorAnalysis) {
8245 for (
unsigned I = VF, E = VL.
size();
I < E;
I += VF) {
8246 for (
unsigned Idx : seq<unsigned>(0, E))
8249 ShuffleMask, CostKind,
I, LoadTy);
8252 GatherCost -= ScalarsCost;
8254 GatherCost = std::min(BaseCost, GatherCost);
8255 }
else if (!Root &&
isSplat(VL)) {
8258 const auto *It =
find_if_not(VL, IsaPred<UndefValue>);
8259 assert(It != VL.
end() &&
"Expected at least one non-undef value.");
8262 count(VL, *It) > 1 &&
8266 CostKind, std::distance(VL.
begin(), It),
8271 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
8277 VecTy, ShuffleMask, CostKind,
8282 (
all_of(Gathers, IsaPred<UndefValue>)
8284 : R.getGatherCost(Gathers, !Root && VL.
equals(Gathers),
8292 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8293 unsigned NumParts) {
8294 assert(VL.
size() > NumParts &&
"Unexpected scalarized shuffle.");
8296 std::accumulate(VL.
begin(), VL.
end(), 0, [](
unsigned Sz,
Value *V) {
8297 auto *EE = dyn_cast<ExtractElementInst>(V);
8300 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
8303 return std::max(Sz, VecTy->getNumElements());
8309 -> std::optional<TTI::ShuffleKind> {
8310 if (NumElts <= EltsPerVector)
8311 return std::nullopt;
8313 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
8315 if (I == PoisonMaskElem)
8317 return std::min(S, I);
8320 int OffsetReg1 = OffsetReg0;
8324 int FirstRegId = -1;
8325 Indices.assign(1, OffsetReg0);
8329 int Idx =
I - OffsetReg0;
8331 (
Idx / NumElts) * NumParts + (
Idx % NumElts) / EltsPerVector;
8334 RegIndices.
insert(RegId);
8335 if (RegIndices.
size() > 2)
8336 return std::nullopt;
8337 if (RegIndices.
size() == 2) {
8339 if (Indices.size() == 1) {
8342 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
8344 if (I == PoisonMaskElem)
8346 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
8347 ((I - OffsetReg0) % NumElts) / EltsPerVector;
8348 if (RegId == FirstRegId)
8350 return std::min(S, I);
8353 Indices.push_back(OffsetReg1 % NumElts);
8355 Idx =
I - OffsetReg1;
8357 I = (
Idx % NumElts) % EltsPerVector +
8358 (RegId == FirstRegId ? 0 : EltsPerVector);
8367 for (
unsigned Part : seq<unsigned>(NumParts)) {
8368 if (!ShuffleKinds[Part])
8371 Part * EltsPerVector,
getNumElems(Mask.size(), EltsPerVector, Part));
8375 std::optional<TTI::ShuffleKind> RegShuffleKind =
8376 CheckPerRegistersShuffle(SubMask, Indices);
8377 if (!RegShuffleKind) {
8380 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.
size())))
8392 for (
unsigned Idx : Indices) {
8394 "SK_ExtractSubvector index out of range");
8398 std::nullopt, CostKind,
Idx,
8408 if (OriginalCost <
Cost)
8409 Cost = OriginalCost;
8417 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8424 void estimateNodesPermuteCost(
const TreeEntry &E1,
const TreeEntry *E2,
8426 unsigned SliceSize) {
8427 if (SameNodesEstimated) {
8433 if ((InVectors.
size() == 2 &&
8434 InVectors.
front().get<
const TreeEntry *>() == &E1 &&
8435 InVectors.
back().get<
const TreeEntry *>() == E2) ||
8436 (!E2 && InVectors.
front().get<
const TreeEntry *>() == &E1)) {
8437 unsigned Limit =
getNumElems(Mask.size(), SliceSize, Part);
8440 "Expected all poisoned elements.");
8442 copy(SubMask, std::next(CommonMask.
begin(), SliceSize * Part));
8447 Cost += createShuffle(InVectors.
front(),
8448 InVectors.
size() == 1 ?
nullptr : InVectors.
back(),
8450 transformMaskAfterShuffle(CommonMask, CommonMask);
8452 SameNodesEstimated =
false;
8453 if (!E2 && InVectors.
size() == 1) {
8454 unsigned VF = E1.getVectorFactor();
8457 cast<FixedVectorType>(V1->
getType())->getNumElements());
8459 const auto *E = InVectors.
front().get<
const TreeEntry *>();
8460 VF = std::max(VF, E->getVectorFactor());
8462 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8464 CommonMask[
Idx] = Mask[
Idx] + VF;
8465 Cost += createShuffle(InVectors.
front(), &E1, CommonMask);
8466 transformMaskAfterShuffle(CommonMask, CommonMask);
8468 Cost += createShuffle(&E1, E2, Mask);
8469 transformMaskAfterShuffle(CommonMask, Mask);
8473 class ShuffleCostBuilder {
8476 static bool isEmptyOrIdentity(
ArrayRef<int> Mask,
unsigned VF) {
8478 return Mask.empty() ||
8479 (VF == Mask.size() &&
8487 ~ShuffleCostBuilder() =
default;
8492 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
8493 if (isEmptyOrIdentity(Mask, VF))
8496 cast<VectorType>(V1->
getType()), Mask);
8501 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
8502 if (isEmptyOrIdentity(Mask, VF))
8505 cast<VectorType>(V1->
getType()), Mask);
8511 void resizeToMatch(
Value *&,
Value *&)
const {}
8521 ShuffleCostBuilder Builder(
TTI);
8524 unsigned CommonVF = Mask.size();
8526 auto GetNodeMinBWAffectedCost = [&](
const TreeEntry &E,
8528 if (E.State == TreeEntry::NeedToGather &&
allConstant(E.Scalars))
8530 Type *EScalarTy = E.Scalars.front()->getType();
8531 bool IsSigned =
true;
8532 if (
auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
8534 IsSigned = It->second.second;
8536 if (EScalarTy != ScalarTy) {
8537 unsigned CastOpcode = Instruction::Trunc;
8538 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8539 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8541 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8550 if (isa<Constant>(V))
8552 auto *VecTy = cast<VectorType>(V->getType());
8554 if (EScalarTy != ScalarTy) {
8556 unsigned CastOpcode = Instruction::Trunc;
8557 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8558 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8560 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8567 if (!V1 && !V2 && !P2.
isNull()) {
8569 const TreeEntry *E = P1.
get<
const TreeEntry *>();
8570 unsigned VF = E->getVectorFactor();
8571 const TreeEntry *E2 = P2.
get<
const TreeEntry *>();
8572 CommonVF = std::max(VF, E2->getVectorFactor());
8575 return Idx < 2 * static_cast<int>(CommonVF);
8577 "All elements in mask must be less than 2 * CommonVF.");
8578 if (E->Scalars.size() == E2->Scalars.size()) {
8582 for (
int &
Idx : CommonMask) {
8585 if (
Idx <
static_cast<int>(CommonVF) && !EMask.
empty())
8587 else if (
Idx >=
static_cast<int>(CommonVF))
8588 Idx = (E2Mask.
empty() ?
Idx - CommonVF : E2Mask[
Idx - CommonVF]) +
8592 CommonVF = E->Scalars.size();
8593 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
8594 GetNodeMinBWAffectedCost(*E2, CommonVF);
8596 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
8597 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
8601 }
else if (!V1 && P2.
isNull()) {
8603 const TreeEntry *E = P1.
get<
const TreeEntry *>();
8604 unsigned VF = E->getVectorFactor();
8608 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
8609 "All elements in mask must be less than CommonVF.");
8610 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8612 assert(!EMask.
empty() &&
"Expected non-empty common mask.");
8613 for (
int &
Idx : CommonMask) {
8617 CommonVF = E->Scalars.size();
8619 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
8622 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8623 CommonVF == CommonMask.
size() &&
8625 [](
const auto &&
P) {
8627 static_cast<unsigned>(
P.value()) !=
P.index();
8635 }
else if (V1 && P2.
isNull()) {
8637 ExtraCost += GetValueMinBWAffectedCost(V1);
8638 CommonVF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8641 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
8642 "All elements in mask must be less than CommonVF.");
8643 }
else if (V1 && !V2) {
8645 unsigned VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8646 const TreeEntry *E2 = P2.
get<
const TreeEntry *>();
8647 CommonVF = std::max(VF, E2->getVectorFactor());
8650 return Idx < 2 * static_cast<int>(CommonVF);
8652 "All elements in mask must be less than 2 * CommonVF.");
8653 if (E2->Scalars.size() == VF && VF != CommonVF) {
8655 assert(!E2Mask.
empty() &&
"Expected non-empty common mask.");
8656 for (
int &
Idx : CommonMask) {
8659 if (
Idx >=
static_cast<int>(CommonVF))
8660 Idx = E2Mask[
Idx - CommonVF] + VF;
8664 ExtraCost += GetValueMinBWAffectedCost(V1);
8666 ExtraCost += GetNodeMinBWAffectedCost(
8667 *E2, std::min(CommonVF, E2->getVectorFactor()));
8669 }
else if (!V1 && V2) {
8671 unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
8672 const TreeEntry *E1 = P1.
get<
const TreeEntry *>();
8673 CommonVF = std::max(VF, E1->getVectorFactor());
8676 return Idx < 2 * static_cast<int>(CommonVF);
8678 "All elements in mask must be less than 2 * CommonVF.");
8679 if (E1->Scalars.size() == VF && VF != CommonVF) {
8681 assert(!E1Mask.
empty() &&
"Expected non-empty common mask.");
8682 for (
int &
Idx : CommonMask) {
8685 if (
Idx >=
static_cast<int>(CommonVF))
8686 Idx = E1Mask[
Idx - CommonVF] + VF;
8692 ExtraCost += GetNodeMinBWAffectedCost(
8693 *E1, std::min(CommonVF, E1->getVectorFactor()));
8695 ExtraCost += GetValueMinBWAffectedCost(V2);
8698 assert(V1 && V2 &&
"Expected both vectors.");
8699 unsigned VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8701 std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
8704 return Idx < 2 * static_cast<int>(CommonVF);
8706 "All elements in mask must be less than 2 * CommonVF.");
8708 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
8709 if (V1->
getType() != V2->getType()) {
8713 if (cast<VectorType>(V1->
getType())->getElementType() != ScalarTy)
8715 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
8721 if (InVectors.
size() == 2)
8723 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
8724 V1, V2, CommonMask, Builder);
8731 : ScalarTy(ScalarTy),
TTI(
TTI),
8732 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
8733 CheckedExtracts(CheckedExtracts) {}
8735 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8736 unsigned NumParts,
bool &UseVecBaseAsInput) {
8737 UseVecBaseAsInput =
false;
8740 Value *VecBase =
nullptr;
8743 if (NumParts == VL.
size())
8747 bool PrevNodeFound =
any_of(
8749 [&](
const std::unique_ptr<TreeEntry> &TE) {
8750 return ((!TE->isAltShuffle() &&
8751 TE->getOpcode() == Instruction::ExtractElement) ||
8752 TE->State == TreeEntry::NeedToGather) &&
8753 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
8754 return VL.size() > Data.index() &&
8755 (Mask[Data.index()] == PoisonMaskElem ||
8756 isa<UndefValue>(VL[Data.index()]) ||
8757 Data.value() == VL[Data.index()]);
8762 for (
unsigned Part : seq<unsigned>(NumParts)) {
8764 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
8767 if (isa<UndefValue>(V) ||
8776 auto *EE = cast<ExtractElementInst>(V);
8777 VecBase = EE->getVectorOperand();
8778 UniqueBases.
insert(VecBase);
8779 const TreeEntry *VE = R.getTreeEntry(V);
8780 if (!CheckedExtracts.
insert(V).second ||
8781 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
8784 return isa<GetElementPtrInst>(U) &&
8785 !R.areAllUsersVectorized(cast<Instruction>(U),
8793 unsigned Idx = *EEIdx;
8795 if (EE->hasOneUse() || !PrevNodeFound) {
8797 if (isa<SExtInst, ZExtInst>(Ext) &&
8798 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8803 EE->getVectorOperandType(),
Idx);
8806 Ext->getOpcode(), Ext->getType(), EE->getType(),
8822 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8825 transformMaskAfterShuffle(CommonMask, CommonMask);
8826 SameNodesEstimated =
false;
8827 if (NumParts != 1 && UniqueBases.
size() != 1) {
8828 UseVecBaseAsInput =
true;
8836 std::optional<InstructionCost>
8840 return std::nullopt;
8846 return Idx < static_cast<int>(E1.getVectorFactor());
8848 "Expected single vector shuffle mask.");
8852 if (InVectors.
empty()) {
8853 CommonMask.
assign(Mask.begin(), Mask.end());
8854 InVectors.
assign({&E1, &E2});
8857 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
8860 if (NumParts == 0 || NumParts >= Mask.size())
8865 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8866 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
8869 if (InVectors.
empty()) {
8870 CommonMask.
assign(Mask.begin(), Mask.end());
8871 InVectors.
assign(1, &E1);
8874 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
8877 if (NumParts == 0 || NumParts >= Mask.size())
8882 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8883 estimateNodesPermuteCost(E1,
nullptr, Mask, Part, SliceSize);
8884 if (!SameNodesEstimated && InVectors.
size() == 1)
8897 cast<ExtractElementInst>(InVectors.
front()
8898 .get<
const TreeEntry *>()
8899 ->Scalars[
P.index()]);
8900 return EI->getVectorOperand() == V1 ||
8901 EI->getVectorOperand() == V2;
8903 "Expected extractelement vectors.");
8907 if (InVectors.
empty()) {
8909 "Expected empty input mask/vectors.");
8910 CommonMask.
assign(Mask.begin(), Mask.end());
8917 InVectors.
front().is<
const TreeEntry *>() && !CommonMask.
empty() &&
8921 .get<const TreeEntry *>()
8922 ->Scalars[
P.index()];
8924 return P.value() == Mask[
P.index()] ||
8925 isa<UndefValue>(Scalar);
8926 if (isa<Constant>(V1))
8928 auto *EI = cast<ExtractElementInst>(Scalar);
8929 return EI->getVectorOperand() == V1;
8931 "Expected only tree entry for extractelement vectors.");
8935 "Expected only tree entries from extracts/reused buildvectors.");
8936 unsigned VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8937 if (InVectors.
size() == 2) {
8938 Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
8939 transformMaskAfterShuffle(CommonMask, CommonMask);
8940 VF = std::max<unsigned>(VF, CommonMask.
size());
8941 }
else if (
const auto *InTE =
8942 InVectors.
front().dyn_cast<
const TreeEntry *>()) {
8943 VF = std::max(VF, InTE->getVectorFactor());
8947 ->getNumElements());
8950 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8952 CommonMask[
Idx] = Mask[
Idx] + VF;
8955 Value *Root =
nullptr) {
8956 Cost += getBuildVectorCost(VL, Root);
8960 unsigned VF = VL.
size();
8962 VF = std::min(VF, MaskVF);
8964 if (isa<UndefValue>(V)) {
8974 cast<FixedVectorType>(Root->
getType())->getNumElements()),
8975 getAllOnesValue(*R.DL, ScalarTy));
8985 if (InVectors.
size() == 2)
8986 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
8988 Cost += createShuffle(Vec,
nullptr, CommonMask);
8989 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8993 "Expected vector length for the final value before action.");
8995 Action(V, CommonMask);
8996 InVectors.
front() = V;
8999 if (CommonMask.
empty()) {
9000 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
9004 createShuffle(InVectors.
front(),
9005 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
9011 "Shuffle construction must be finalized.");
9015const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(
const TreeEntry *E,
9016 unsigned Idx)
const {
9018 if (
const TreeEntry *TE = getTreeEntry(
Op)) {
9019 if (
find_if(TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
9020 return EI.EdgeIdx == Idx && EI.UserTE == E;
9021 }) != TE->UserTreeIndices.end())
9023 auto MIt = MultiNodeScalars.
find(
Op);
9024 if (MIt != MultiNodeScalars.
end()) {
9025 for (
const TreeEntry *TE : MIt->second) {
9026 if (
find_if(TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
9027 return EI.EdgeIdx == Idx && EI.UserTE == E;
9028 }) != TE->UserTreeIndices.end())
9034 find_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
9035 return TE->State == TreeEntry::NeedToGather &&
9036 find_if(
TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
9037 return EI.EdgeIdx == Idx && EI.UserTE == E;
9038 }) !=
TE->UserTreeIndices.end();
9040 assert(It != VectorizableTree.end() &&
"Expected vectorizable entry.");
9045 if (
TE.State == TreeEntry::ScatterVectorize ||
9046 TE.State == TreeEntry::StridedVectorize)
9048 if (
TE.State == TreeEntry::Vectorize &&
TE.getOpcode() == Instruction::Load &&
9049 !
TE.isAltShuffle()) {
9050 if (
TE.ReorderIndices.empty())
9089 Type *ScalarTy = VL[0]->getType();
9090 if (E->State != TreeEntry::NeedToGather) {
9091 if (
auto *SI = dyn_cast<StoreInst>(VL[0]))
9092 ScalarTy =
SI->getValueOperand()->getType();
9093 else if (
auto *CI = dyn_cast<CmpInst>(VL[0]))
9095 else if (
auto *IE = dyn_cast<InsertElementInst>(VL[0]))
9096 ScalarTy =
IE->getOperand(1)->getType();
9104 auto It = MinBWs.
find(E);
9105 Type *OrigScalarTy = ScalarTy;
9106 if (It != MinBWs.
end())
9109 unsigned EntryVF = E->getVectorFactor();
9112 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
9113 if (E->State == TreeEntry::NeedToGather) {
9116 if (isa<InsertElementInst>(VL[0]))
9118 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
9119 E, ScalarTy, *
TTI, VectorizedVals, *
this, CheckedExtracts);
9124 if (!E->ReorderIndices.empty() &&
9125 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
9127 if (E->getOpcode() == Instruction::Store) {
9129 NewMask.
resize(E->ReorderIndices.size());
9130 copy(E->ReorderIndices, NewMask.
begin());
9136 if (NeedToShuffleReuses)
9137 ::addMask(Mask, E->ReuseShuffleIndices);
9141 assert((E->State == TreeEntry::Vectorize ||
9142 E->State == TreeEntry::ScatterVectorize ||
9143 E->State == TreeEntry::StridedVectorize) &&
9147 (E->getOpcode() == Instruction::GetElementPtr &&
9148 E->getMainOp()->getType()->isPointerTy())) &&
9151 unsigned ShuffleOrOp =
9152 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
9154 const unsigned Sz = UniqueValues.
size();
9156 for (
unsigned I = 0;
I < Sz; ++
I) {
9157 if (getTreeEntry(UniqueValues[
I]) == E)
9161 auto GetCastContextHint = [&](
Value *
V) {
9162 if (
const TreeEntry *OpTE = getTreeEntry(V))
9163 return getCastContextHint(*OpTE);
9164 InstructionsState SrcState =
getSameOpcode(E->getOperand(0), *TLI);
9165 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
9174 if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
9178 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
9180 for (
unsigned I = 0;
I < Sz; ++
I) {
9181 if (UsedScalars.test(
I))
9183 ScalarCost += ScalarEltCost(
I);
9191 const EdgeInfo &EI = E->UserTreeIndices.front();
9192 if ((EI.UserTE->getOpcode() != Instruction::Select ||
9194 It != MinBWs.
end()) {
9195 auto UserBWIt = MinBWs.
find(EI.UserTE);
9196 Type *UserScalarTy =
9197 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
9198 if (UserBWIt != MinBWs.
end())
9200 UserBWIt->second.first);
9201 if (ScalarTy != UserScalarTy) {
9202 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
9203 unsigned SrcBWSz =
DL->getTypeSizeInBits(UserScalarTy);
9208 VecOpcode = Instruction::Trunc;
9211 It->second.second ? Instruction::SExt : Instruction::ZExt;
9218 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
9219 ScalarCost,
"Calculated costs for Tree"));
9220 return VecCost - ScalarCost;
9225 assert((E->State == TreeEntry::Vectorize ||
9226 E->State == TreeEntry::StridedVectorize) &&
9227 "Entry state expected to be Vectorize or StridedVectorize here.");
9231 *
TTI, Ptrs, BasePtr, E->getOpcode(),
CostKind, OrigScalarTy, VecTy);
9232 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
9233 "Calculated GEPs cost for Tree"));
9235 return VecCost - ScalarCost;
9238 switch (ShuffleOrOp) {
9239 case Instruction::PHI: {
9243 for (
Value *V : UniqueValues) {
9244 auto *
PHI = dyn_cast<PHINode>(V);
9249 for (
unsigned I = 0,
N =
PHI->getNumIncomingValues();
I <
N; ++
I) {
9253 if (
const TreeEntry *OpTE = getTreeEntry(
Operands.front()))
9255 if (!OpTE->ReuseShuffleIndices.empty())
9256 ScalarCost +=
TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
9257 OpTE->Scalars.size());
9260 return CommonCost - ScalarCost;
9262 case Instruction::ExtractValue:
9263 case Instruction::ExtractElement: {
9264 auto GetScalarCost = [&](
unsigned Idx) {
9265 auto *
I = cast<Instruction>(UniqueValues[
Idx]);
9267 if (ShuffleOrOp == Instruction::ExtractElement) {
9268 auto *EE = cast<ExtractElementInst>(
I);
9269 SrcVecTy = EE->getVectorOperandType();
9271 auto *EV = cast<ExtractValueInst>(
I);
9272 Type *AggregateTy = EV->getAggregateOperand()->getType();
9274 if (
auto *ATy = dyn_cast<ArrayType>(AggregateTy))
9275 NumElts = ATy->getNumElements();
9280 if (
I->hasOneUse()) {
9282 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
9283 all_of(
Ext->users(), IsaPred<GetElementPtrInst>)) {
9290 Ext->getOpcode(),
Ext->getType(),
I->getType(),
9298 auto GetVectorCost = [](
InstructionCost CommonCost) {
return CommonCost; };
9299 return GetCostDiff(GetScalarCost, GetVectorCost);
9301 case Instruction::InsertElement: {
9302 assert(E->ReuseShuffleIndices.empty() &&
9303 "Unique insertelements only are expected.");
9304 auto *SrcVecTy = cast<FixedVectorType>(VL0->
getType());
9305 unsigned const NumElts = SrcVecTy->getNumElements();
9306 unsigned const NumScalars = VL.
size();
9312 unsigned OffsetEnd = OffsetBeg;
9313 InsertMask[OffsetBeg] = 0;
9316 if (OffsetBeg >
Idx)
9318 else if (OffsetEnd <
Idx)
9320 InsertMask[
Idx] =
I + 1;
9324 VecScalarsSz =
PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
9325 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9327 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9328 unsigned InsertVecSz = std::min<unsigned>(
9330 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9331 bool IsWholeSubvector =
9332 OffsetBeg ==
Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
9336 if (OffsetBeg + InsertVecSz > VecSz) {
9339 InsertVecSz = VecSz;
9345 if (!E->ReorderIndices.empty()) {
9350 std::iota(
Mask.begin(), std::next(
Mask.begin(), InsertVecSz), 0);
9352 bool IsIdentity =
true;
9354 Mask.swap(PrevMask);
9355 for (
unsigned I = 0;
I < NumScalars; ++
I) {
9357 DemandedElts.
setBit(InsertIdx);
9358 IsIdentity &= InsertIdx - OffsetBeg ==
I;
9359 Mask[InsertIdx - OffsetBeg] =
I;
9361 assert(
Offset < NumElts &&
"Failed to find vector index offset");
9376 auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
9377 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
9385 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
9386 if (!InMask.
all() && NumScalars != NumElts && !IsWholeSubvector) {
9387 if (InsertVecSz != VecSz) {
9399 for (
unsigned I = OffsetEnd + 1 -
Offset;
I < VecSz; ++
I)
9408 case Instruction::ZExt:
9409 case Instruction::SExt:
9410 case Instruction::FPToUI:
9411 case Instruction::FPToSI:
9412 case Instruction::FPExt:
9413 case Instruction::PtrToInt:
9414 case Instruction::IntToPtr:
9415 case Instruction::SIToFP:
9416 case Instruction::UIToFP:
9417 case Instruction::Trunc:
9418 case Instruction::FPTrunc:
9419 case Instruction::BitCast: {
9420 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
9423 unsigned Opcode = ShuffleOrOp;
9424 unsigned VecOpcode = Opcode;
9426 (SrcIt != MinBWs.
end() || It != MinBWs.
end())) {
9428 unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy);
9429 if (SrcIt != MinBWs.
end()) {
9430 SrcBWSz = SrcIt->second.first;
9434 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
9435 if (BWSz == SrcBWSz) {
9436 VecOpcode = Instruction::BitCast;
9437 }
else if (BWSz < SrcBWSz) {
9438 VecOpcode = Instruction::Trunc;
9439 }
else if (It != MinBWs.
end()) {
9440 assert(BWSz > SrcBWSz &&
"Invalid cast!");
9441 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9442 }
else if (SrcIt != MinBWs.
end()) {
9443 assert(BWSz > SrcBWSz &&
"Invalid cast!");
9445 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9447 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.
end() &&
9448 !SrcIt->second.second) {
9449 VecOpcode = Instruction::UIToFP;
9452 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9460 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9462 auto *
VI = VL0->
getOpcode() == Opcode ? VL0 :
nullptr;
9466 VecOpcode == Opcode ? VI :
nullptr);
9468 return GetCostDiff(GetScalarCost, GetVectorCost);
9470 case Instruction::FCmp:
9471 case Instruction::ICmp:
9472 case Instruction::Select: {
9476 match(VL0, MatchCmp))
9482 auto GetScalarCost = [&](
unsigned Idx) {
9483 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9489 !
match(VI, MatchCmp)) ||
9490 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9496 Builder.getInt1Ty(), CurrentPred,
CostKind,
9503 E->getOpcode(), VecTy, MaskTy, VecPred,
CostKind, VL0);
9515 if (IntrinsicAndUse.second)
9518 VecCost = std::min(VecCost, IntrinsicCost);
9520 return VecCost + CommonCost;
9522 return GetCostDiff(GetScalarCost, GetVectorCost);
9524 case Instruction::FNeg:
9525 case Instruction::Add:
9526 case Instruction::FAdd:
9527 case Instruction::Sub:
9528 case Instruction::FSub:
9529 case Instruction::Mul:
9530 case Instruction::FMul:
9531 case Instruction::UDiv:
9532 case Instruction::SDiv:
9533 case Instruction::FDiv:
9534 case Instruction::URem:
9535 case Instruction::SRem:
9536 case Instruction::FRem:
9537 case Instruction::Shl:
9538 case Instruction::LShr:
9539 case Instruction::AShr:
9540 case Instruction::And:
9541 case Instruction::Or:
9542 case Instruction::Xor: {
9543 auto GetScalarCost = [&](
unsigned Idx) {
9544 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9545 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
9554 if (ShuffleOrOp == Instruction::And && It != MinBWs.
end()) {
9555 for (
unsigned I : seq<unsigned>(0, E->getNumOperands())) {
9558 auto *CI = dyn_cast<ConstantInt>(
Op);
9559 return CI && CI->getValue().countr_one() >= It->second.first;
9564 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
9568 Op2Info, std::nullopt,
nullptr, TLI) +
9571 return GetCostDiff(GetScalarCost, GetVectorCost);
9573 case Instruction::GetElementPtr: {
9574 return CommonCost + GetGEPCostDiff(VL, VL0);
9576 case Instruction::Load: {
9577 auto GetScalarCost = [&](
unsigned Idx) {
9578 auto *
VI = cast<LoadInst>(UniqueValues[
Idx]);
9580 VI->getAlign(),
VI->getPointerAddressSpace(),
9583 auto *LI0 = cast<LoadInst>(VL0);
9586 if (E->State == TreeEntry::Vectorize) {
9588 Instruction::Load, VecTy, LI0->getAlign(),
9590 }
else if (E->State == TreeEntry::StridedVectorize) {
9591 Align CommonAlignment =
9592 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9594 Instruction::Load, VecTy, LI0->getPointerOperand(),
9597 assert(E->State == TreeEntry::ScatterVectorize &&
"Unknown EntryState");
9598 Align CommonAlignment =
9599 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9601 Instruction::Load, VecTy, LI0->getPointerOperand(),
9604 return VecLdCost + CommonCost;
9610 if (E->State == TreeEntry::ScatterVectorize)
9616 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
9617 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9619 case Instruction::Store: {
9620 bool IsReorder = !E->ReorderIndices.empty();
9621 auto GetScalarCost = [=](
unsigned Idx) {
9622 auto *
VI = cast<StoreInst>(VL[
Idx]);
9625 VI->getAlign(),
VI->getPointerAddressSpace(),
9629 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9633 if (E->State == TreeEntry::StridedVectorize) {
9634 Align CommonAlignment =
9635 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
9637 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9640 assert(E->State == TreeEntry::Vectorize &&
9641 "Expected either strided or consecutive stores.");
9644 Instruction::Store, VecTy, BaseSI->getAlign(),
9645 BaseSI->getPointerAddressSpace(),
CostKind, OpInfo);
9647 return VecStCost + CommonCost;
9651 unsigned Idx = IsReorder ? E->ReorderIndices[
I] :
I;
9652 PointerOps[
Idx] = cast<StoreInst>(V)->getPointerOperand();
9655 return GetCostDiff(GetScalarCost, GetVectorCost) +
9656 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9658 case Instruction::Call: {
9659 auto GetScalarCost = [&](
unsigned Idx) {
9660 auto *CI = cast<CallInst>(UniqueValues[
Idx]);
9671 auto *CI = cast<CallInst>(VL0);
9675 It != MinBWs.
end() ? It->second.first : 0);
9677 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
9679 return GetCostDiff(GetScalarCost, GetVectorCost);
9681 case Instruction::ShuffleVector: {
9682 assert(E->isAltShuffle() &&
9687 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9688 "Invalid Shuffle Vector Operand");
9691 auto TryFindNodeWithEqualOperands = [=]() {
9692 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9695 if (
TE->isAltShuffle() &&
9696 ((
TE->getOpcode() == E->getOpcode() &&
9697 TE->getAltOpcode() == E->getAltOpcode()) ||
9698 (
TE->getOpcode() == E->getAltOpcode() &&
9699 TE->getAltOpcode() == E->getOpcode())) &&
9700 TE->hasEqualOperands(*E))
9705 auto GetScalarCost = [&](
unsigned Idx) {
9706 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9707 assert(E->isOpcodeOrAlt(VI) &&
"Unexpected main/alternate opcode");
9717 if (TryFindNodeWithEqualOperands()) {
9719 dbgs() <<
"SLP: diamond match for alternate node found.\n";
9726 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy,
CostKind);
9728 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
9729 }
else if (
auto *CI0 = dyn_cast<CmpInst>(VL0)) {
9731 VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
9732 CI0->getPredicate(),
CostKind, VL0);
9733 VecCost += TTIRef.getCmpSelInstrCost(
9734 E->getOpcode(), VecTy, MaskTy,
9735 cast<CmpInst>(E->getAltOp())->getPredicate(),
CostKind,
9738 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
9741 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
9742 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
9744 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
9745 if (SrcIt != MinBWs.
end()) {
9746 SrcBWSz = SrcIt->second.first;
9750 if (BWSz <= SrcBWSz) {
9753 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
9757 <<
"SLP: alternate extension, which should be truncated.\n";
9763 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
9766 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
9770 E->buildAltOpShuffleMask(
9772 assert(E->isOpcodeOrAlt(
I) &&
"Unexpected main/alternate opcode");
9773 return I->getOpcode() == E->getAltOpcode();
9782 unsigned Opcode0 = E->getOpcode();
9783 unsigned Opcode1 = E->getAltOpcode();
9787 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9789 VecTy, Opcode0, Opcode1, OpcodeMask,
CostKind);
9790 return AltVecCost < VecCost ? AltVecCost : VecCost;
9795 return GetCostDiff(GetScalarCost, GetVectorCost);
9802bool BoUpSLP::isFullyVectorizableTinyTree(
bool ForReduction)
const {
9804 << VectorizableTree.size() <<
" is fully vectorizable .\n");
9806 auto &&AreVectorizableGathers = [
this](
const TreeEntry *
TE,
unsigned Limit) {
9808 return TE->State == TreeEntry::NeedToGather &&
9810 [
this](
Value *V) { return EphValues.contains(V); }) &&
9812 TE->Scalars.size() < Limit ||
9813 ((
TE->getOpcode() == Instruction::ExtractElement ||
9814 all_of(
TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
9816 (
TE->State == TreeEntry::NeedToGather &&
9817 TE->getOpcode() == Instruction::Load && !
TE->isAltShuffle()));
9821 if (VectorizableTree.size() == 1 &&
9822 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
9824 AreVectorizableGathers(VectorizableTree[0].
get(),
9825 VectorizableTree[0]->Scalars.size()) &&
9826 VectorizableTree[0]->getVectorFactor() > 2)))
9829 if (VectorizableTree.size() != 2)
9837 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
9838 AreVectorizableGathers(VectorizableTree[1].
get(),
9839 VectorizableTree[0]->Scalars.size()))
9843 if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
9844 (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9845 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
9846 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
9854 bool MustMatchOrInst) {
9858 Value *ZextLoad = Root;
9859 const APInt *ShAmtC;
9860 bool FoundOr =
false;
9861 while (!isa<ConstantExpr>(ZextLoad) &&
9864 ShAmtC->
urem(8) == 0))) {
9865 auto *BinOp = cast<BinaryOperator>(ZextLoad);
9866 ZextLoad = BinOp->getOperand(0);
9867 if (BinOp->getOpcode() == Instruction::Or)
9872 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
9879 Type *SrcTy = Load->getType();
9886 LLVM_DEBUG(
dbgs() <<
"SLP: Assume load combining for tree starting at "
9887 << *(cast<Instruction>(Root)) <<
"\n");
9896 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9897 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
9905 unsigned NumElts = Stores.
size();
9906 for (
Value *Scalar : Stores) {
9917 if (VectorizableTree.size() == 2 &&
9918 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
9919 VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9920 (VectorizableTree[1]->getVectorFactor() <= 2 ||
9921 !(
isSplat(VectorizableTree[1]->Scalars) ||
9929 constexpr int Limit = 4;
9931 !VectorizableTree.empty() &&
9932 all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
9933 return (TE->State == TreeEntry::NeedToGather &&
9934 TE->getOpcode() != Instruction::ExtractElement &&
9935 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
9936 TE->getOpcode() == Instruction::PHI;
9947 if (isFullyVectorizableTinyTree(ForReduction))
9952 bool IsAllowedSingleBVNode =
9953 VectorizableTree.size() > 1 ||
9954 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
9955 !VectorizableTree.front()->isAltShuffle() &&
9956 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
9957 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
9959 if (
any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
9960 return TE->State == TreeEntry::NeedToGather &&
9962 return isa<ExtractElementInst, UndefValue>(V) ||
9963 (IsAllowedSingleBVNode &&
9964 !V->hasNUsesOrMore(UsesLimit) &&
9965 any_of(V->users(), IsaPred<InsertElementInst>));
9970 assert(VectorizableTree.empty()
9971 ? ExternalUses.empty()
9972 :
true &&
"We shouldn't have any external users");
9984 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
9997 for (
const auto &TEPtr : VectorizableTree) {
9998 if (TEPtr->State != TreeEntry::Vectorize)
10000 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
10006 auto *NodeA = DT->
getNode(
A->getParent());
10007 auto *NodeB = DT->
getNode(
B->getParent());
10008 assert(NodeA &&
"Should only process reachable instructions");
10009 assert(NodeB &&
"Should only process reachable instructions");
10010 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10011 "Different nodes should have different DFS numbers");
10012 if (NodeA != NodeB)
10013 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
10014 return B->comesBefore(
A);
10024 LiveValues.
erase(PrevInst);
10025 for (
auto &J : PrevInst->
operands()) {
10026 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
10027 LiveValues.
insert(cast<Instruction>(&*J));
10031 dbgs() <<
"SLP: #LV: " << LiveValues.
size();
10032 for (
auto *
X : LiveValues)
10033 dbgs() <<
" " <<
X->getName();
10034 dbgs() <<
", Looking at ";
10039 unsigned NumCalls = 0;
10043 while (InstIt != PrevInstIt) {
10044 if (PrevInstIt == PrevInst->
getParent()->rend()) {
10045 PrevInstIt = Inst->getParent()->rbegin();
10050 if (
auto *
II = dyn_cast<IntrinsicInst>(
I)) {
10051 if (
II->isAssumeLikeIntrinsic())
10055 for (
auto &ArgOp :
II->args())
10057 if (
auto *FPMO = dyn_cast<FPMathOperator>(
II))
10058 FMF = FPMO->getFastMathFlags();
10065 if (IntrCost < CallCost)
10072 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
10073 &*PrevInstIt != PrevInst)
10081 for (
auto *
II : LiveValues) {
10082 auto *ScalarTy =
II->getType();
10083 if (
auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
10084 ScalarTy = VectorTy->getElementType();
10102 const auto *I1 = IE1;
10103 const auto *I2 = IE2;
10115 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
10117 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
10118 if (I2 && ((I2 == IE2 || I2->
hasOneUse())) &&
10120 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
10121 }
while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
10128struct ValueSelect {
10129 template <
typename U>
10130 static std::enable_if_t<std::is_same_v<Value *, U>,
Value *>
get(
Value *V) {
10133 template <
typename U>
10134 static std::enable_if_t<!std::is_same_v<Value *, U>,
U>
get(
Value *) {
10152template <
typename T>
10158 assert(!ShuffleMask.empty() &&
"Empty list of shuffles for inserts.");
10160 auto VMIt = std::next(ShuffleMask.begin());
10163 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
10165 if (!IsBaseUndef.
all()) {
10167 std::pair<T *, bool> Res =
10168 ResizeAction(ShuffleMask.begin()->first, Mask,
false);
10170 for (
unsigned Idx = 0, VF = Mask.size();
Idx < VF; ++
Idx) {
10174 Mask[
Idx] = (Res.second ?
Idx : Mask[
Idx]) + VF;
10176 auto *V = ValueSelect::get<T *>(
Base);
10178 assert((!V || GetVF(V) == Mask.size()) &&
10179 "Expected base vector of VF number of elements.");
10180 Prev = Action(Mask, {
nullptr, Res.first});
10181 }
else if (ShuffleMask.size() == 1) {
10184 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
10190 Prev = Action(Mask, {ShuffleMask.begin()->first});
10194 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
10195 unsigned Vec2VF = GetVF(VMIt->first);
10196 if (Vec1VF == Vec2VF) {
10200 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
10203 Mask[
I] = SecMask[
I] + Vec1VF;
10206 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
10209 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
10211 std::pair<T *, bool> Res2 =
10212 ResizeAction(VMIt->first, VMIt->second,
false);
10214 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
10221 Mask[
I] = (Res2.second ?
I : SecMask[
I]) + VF;
10224 Prev = Action(Mask, {Res1.first, Res2.first});
10226 VMIt = std::next(VMIt);
10228 bool IsBaseNotUndef = !IsBaseUndef.
all();
10229 (void)IsBaseNotUndef;
10231 for (
auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
10233 std::pair<T *, bool> Res =
10234 ResizeAction(VMIt->first, VMIt->second,
false);
10236 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
10239 "Multiple uses of scalars.");
10240 Mask[
I] = (Res.second ?
I : SecMask[
I]) + VF;
10245 Prev = Action(Mask, {Prev, Res.first});
10253 << VectorizableTree.size() <<
".\n");
10255 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
10258 for (
unsigned I = 0, E = VectorizableTree.size();
I < E; ++
I) {
10259 TreeEntry &TE = *VectorizableTree[
I];
10260 if (TE.State == TreeEntry::NeedToGather) {
10261 if (
const TreeEntry *E = getTreeEntry(TE.getMainOp());
10262 E && E->getVectorFactor() == TE.getVectorFactor() &&
10263 E->isSame(TE.Scalars)) {
10268 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10277 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10287 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
10288 for (ExternalUser &EU : ExternalUses) {
10290 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
10291 !ExtractCostCalculated.
insert(EU.Scalar).second)
10297 if (EphValues.
count(EU.User))
10301 if (isa<FixedVectorType>(EU.Scalar->getType()))
10306 if (
auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
10308 if (
auto *FTy = dyn_cast<FixedVectorType>(VU->
getType())) {
10309 if (!UsedInserts.
insert(VU).second)
10313 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
10316 [
this, VU](
const std::pair<Value *, const TreeEntry *> &Pair) {
10318 VU, cast<InsertElementInst>(Pair.first),
10320 Value *Op0 = II->getOperand(0);
10321 if (getTreeEntry(II) && !getTreeEntry(Op0))
10327 if (It == FirstUsers.
end()) {
10334 while (
auto *IEBase = dyn_cast<InsertElementInst>(
Base)) {
10335 if (IEBase != EU.User &&
10336 (!IEBase->hasOneUse() ||
10340 if (
const TreeEntry *E = getTreeEntry(IEBase)) {
10343 IEBase = cast<InsertElementInst>(
Base);
10346 "InsertElementInstruction used already.");
10348 Base = IEBase->getOperand(0);
10349 }
while (E == getTreeEntry(
Base));
10352 Base = cast<InsertElementInst>(
Base)->getOperand(0);
10356 VecId = FirstUsers.
size() - 1;
10357 auto It = MinBWs.
find(ScalarTE);
10358 if (It != MinBWs.
end() &&
10360 .
insert(std::make_pair(ScalarTE, FTy->getElementType()))
10362 unsigned BWSz = It->second.first;
10363 unsigned DstBWSz =
DL->getTypeSizeInBits(FTy->getElementType());
10364 unsigned VecOpcode;
10365 if (DstBWSz < BWSz)
10366 VecOpcode = Instruction::Trunc;
10369 It->second.second ? Instruction::SExt : Instruction::ZExt;
10375 FTy->getNumElements()),
10378 <<
" for extending externally used vector with "
10379 "non-equal minimum bitwidth.\n");
10385 VecId = std::distance(FirstUsers.
begin(), It);
10387 int InIdx = *InsertIdx;
10391 Mask[InIdx] = EU.Lane;
10392 DemandedElts[VecId].setBit(InIdx);
10400 if (
auto *
GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10401 if (!ValueToExtUses) {
10402 ValueToExtUses.emplace();
10404 ValueToExtUses->try_emplace(
P.value().Scalar,
P.index());
10410 if (!getTreeEntry(V))
10412 auto It = ValueToExtUses->find(V);
10413 if (It != ValueToExtUses->end()) {
10415 ExternalUses[It->second].User = nullptr;
10420 if (CanBeUsedAsGEP) {
10422 ExternalUsesAsGEPs.
insert(EU.Scalar);
10431 auto It = MinBWs.
find(getTreeEntry(EU.Scalar));
10432 if (It != MinBWs.
end()) {
10435 It->second.second ? Instruction::SExt : Instruction::ZExt;
10445 if (!VectorizedVals.
empty()) {
10446 const TreeEntry &Root = *VectorizableTree.front().get();
10447 auto BWIt = MinBWs.find(&Root);
10448 if (BWIt != MinBWs.end()) {
10449 Type *DstTy = Root.Scalars.front()->getType();
10450 unsigned OriginalSz =
DL->getTypeSizeInBits(DstTy);
10452 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10453 if (OriginalSz != SrcSz) {
10454 unsigned Opcode = Instruction::Trunc;
10455 if (OriginalSz > SrcSz)
10456 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10466 Cost += SpillCost + ExtractCost;
10470 unsigned VF =
Mask.size();
10471 unsigned VecVF =
TE->getVectorFactor();
10473 (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); }) ||
10476 std::copy(
Mask.begin(), std::next(
Mask.begin(), std::min(VF, VecVF)),
10482 dbgs() <<
"SLP: Adding cost " <<
C
10483 <<
" for final shuffle of insertelement external users.\n";
10484 TE->dump();
dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10486 return std::make_pair(TE,
true);
10488 return std::make_pair(TE,
false);
10491 for (
int I = 0, E = FirstUsers.size();
I < E; ++
I) {
10492 Value *
Base = cast<Instruction>(FirstUsers[
I].first)->getOperand(0);
10493 auto Vector = ShuffleMasks[
I].takeVector();
10497 assert((TEs.size() == 1 || TEs.size() == 2) &&
10498 "Expected exactly 1 or 2 tree entries.");
10499 if (TEs.size() == 1) {
10501 VF = TEs.front()->getVectorFactor();
10507 (
Data.index() < VF &&
10508 static_cast<int>(
Data.index()) ==
Data.value());
10513 <<
" for final shuffle of insertelement "
10514 "external users.\n";
10515 TEs.front()->
dump();
10516 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10522 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10523 VF = TEs.front()->getVectorFactor();
10532 <<
" for final shuffle of vector node and external "
10533 "insertelement users.\n";
10534 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10535 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10541 (void)performExtractsShuffleAction<const TreeEntry>(
10543 [](
const TreeEntry *E) {
return E->getVectorFactor(); }, ResizeToVF,
10544 EstimateShufflesCost);
10546 cast<FixedVectorType>(FirstUsers[
I].first->getType()), DemandedElts[
I],
10548 Cost -= InsertCost;
10552 if (ReductionBitWidth != 0) {
10553 assert(UserIgnoreList &&
"Expected reduction tree.");
10554 const TreeEntry &E = *VectorizableTree.front().get();
10555 auto It = MinBWs.find(&E);
10556 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10557 unsigned SrcSize = It->second.first;
10558 unsigned DstSize = ReductionBitWidth;
10559 unsigned Opcode = Instruction::Trunc;
10560 if (SrcSize < DstSize)
10561 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10568 switch (E.getOpcode()) {
10569 case Instruction::SExt:
10570 case Instruction::ZExt:
10571 case Instruction::Trunc: {
10572 const TreeEntry *OpTE = getOperandEntry(&E, 0);
10573 CCH = getCastContextHint(*OpTE);
10583 <<
" for final resize for reduction from " << SrcVecTy
10584 <<
" to " << DstVecTy <<
"\n";
10585 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10593 OS <<
"SLP: Spill Cost = " << SpillCost <<
".\n"
10594 <<
"SLP: Extract Cost = " << ExtractCost <<
".\n"
10595 <<
"SLP: Total Cost = " <<
Cost <<
".\n";
10599 ViewGraph(
this,
"SLP" +
F->getName(),
false, Str);
10610std::optional<TTI::ShuffleKind>
10611BoUpSLP::tryToGatherSingleRegisterExtractElements(
10617 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
10618 auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
10620 if (isa<UndefValue>(VL[
I]))
10624 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
10625 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
10634 ExtractMask.reset(*
Idx);
10639 VectorOpToIdx[EI->getVectorOperand()].push_back(
I);
10643 for (
const auto &
Data : VectorOpToIdx)
10644 VFToVector[cast<FixedVectorType>(
Data.first->getType())->getNumElements()]
10645 .push_back(
Data.first);
10646 for (
auto &
Data : VFToVector) {
10648 return VectorOpToIdx.find(V1)->second.size() >
10649 VectorOpToIdx.find(V2)->second.size();
10654 const int UndefSz = UndefVectorExtracts.
size();
10655 unsigned SingleMax = 0;
10656 Value *SingleVec =
nullptr;
10657 unsigned PairMax = 0;
10658 std::pair<Value *, Value *> PairVec(
nullptr,
nullptr);
10659 for (
auto &
Data : VFToVector) {
10661 if (SingleMax < VectorOpToIdx[V1].
size() + UndefSz) {
10662 SingleMax = VectorOpToIdx[V1].size() + UndefSz;
10666 if (
Data.second.size() > 1)
10667 V2 = *std::next(
Data.second.begin());
10668 if (V2 && PairMax < VectorOpToIdx[V1].
size() + VectorOpToIdx[V2].
size() +
10670 PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[
V2].size() + UndefSz;
10671 PairVec = std::make_pair(V1, V2);
10674 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10675 return std::nullopt;
10681 if (SingleMax >= PairMax && SingleMax) {
10682 for (
int Idx : VectorOpToIdx[SingleVec])
10685 for (
Value *V : {PairVec.first, PairVec.second})
10686 for (
int Idx : VectorOpToIdx[V])
10690 for (
int Idx : UndefVectorExtracts)
10694 std::optional<TTI::ShuffleKind> Res =
10700 return std::nullopt;
10704 for (
int I = 0, E = GatheredExtracts.size();
I < E; ++
I) {
10705 if (Mask[
I] ==
PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[
I]) &&
10706 isa<UndefValue>(GatheredExtracts[
I])) {
10710 auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
10711 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
10712 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
10727 unsigned NumParts)
const {
10728 assert(NumParts > 0 &&
"NumParts expected be greater than or equal to 1.");
10732 for (
unsigned Part : seq<unsigned>(NumParts)) {
10738 std::optional<TTI::ShuffleKind> Res =
10739 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
10740 ShufflesRes[Part] = Res;
10741 copy(SubMask, std::next(
Mask.begin(), Part * SliceSize));
10743 if (
none_of(ShufflesRes, [](
const std::optional<TTI::ShuffleKind> &Res) {
10744 return Res.has_value();
10746 ShufflesRes.clear();
10747 return ShufflesRes;
10750std::optional<TargetTransformInfo::ShuffleKind>
10751BoUpSLP::isGatherShuffledSingleRegisterEntry(
10757 const EdgeInfo &TEUseEI =
TE->UserTreeIndices.front();
10758 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
10762 if (
auto *
PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
10763 TEInsertBlock =
PHI->getIncomingBlock(TEUseEI.EdgeIdx);
10766 TEInsertBlock = TEInsertPt->
getParent();
10769 return std::nullopt;
10770 auto *NodeUI = DT->
getNode(TEInsertBlock);
10771 assert(NodeUI &&
"Should only process reachable instructions");
10773 auto CheckOrdering = [&](
const Instruction *InsertPt) {
10787 auto *NodeEUI = DT->
getNode(InsertBlock);
10790 assert((NodeUI == NodeEUI) ==
10791 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10792 "Different nodes should have different DFS numbers");
10794 if (TEInsertPt->
getParent() != InsertBlock &&
10797 if (TEInsertPt->
getParent() == InsertBlock &&
10811 for (
Value *V : VL) {
10816 for (
const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10820 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
10821 "Must contain at least single gathered value.");
10822 assert(TEPtr->UserTreeIndices.size() == 1 &&
10823 "Expected only single user of a gather node.");
10824 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
10826 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
10829 : &getLastInstructionInBundle(UseEI.UserTE);
10830 if (TEInsertPt == InsertPt) {
10834 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
10838 if (TEUseEI.UserTE != UseEI.UserTE &&
10839 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
10845 if ((TEInsertBlock != InsertPt->
getParent() ||
10846 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
10847 !CheckOrdering(InsertPt))
10851 if (
const TreeEntry *VTE = getTreeEntry(V)) {
10853 if (VTE->State != TreeEntry::Vectorize) {
10854 auto It = MultiNodeScalars.
find(V);
10855 if (It == MultiNodeScalars.
end())
10857 VTE = *It->getSecond().begin();
10859 auto *MIt =
find_if(It->getSecond(), [](
const TreeEntry *MTE) {
10860 return MTE->State == TreeEntry::Vectorize;
10862 if (MIt == It->getSecond().end())
10867 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
10868 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
10872 if (VToTEs.
empty())
10874 if (UsedTEs.
empty()) {
10888 if (!VToTEs.
empty()) {
10894 VToTEs = SavedVToTEs;
10903 if (UsedTEs.
size() == 2)
10905 UsedTEs.push_back(SavedVToTEs);
10912 if (UsedTEs.
empty()) {
10914 return std::nullopt;
10918 if (UsedTEs.
size() == 1) {
10921 UsedTEs.front().
end());
10922 sort(FirstEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
10923 return TE1->Idx < TE2->Idx;
10926 auto *It =
find_if(FirstEntries, [=](
const TreeEntry *EntryPtr) {
10927 return EntryPtr->isSame(VL) || EntryPtr->isSame(
TE->Scalars);
10929 if (It != FirstEntries.end() &&
10930 ((*It)->getVectorFactor() == VL.size() ||
10931 ((*It)->getVectorFactor() ==
TE->Scalars.size() &&
10932 TE->ReuseShuffleIndices.size() == VL.size() &&
10933 (*It)->isSame(
TE->Scalars)))) {
10934 Entries.push_back(*It);
10935 if ((*It)->getVectorFactor() == VL.size()) {
10936 std::iota(std::next(
Mask.begin(), Part * VL.size()),
10937 std::next(
Mask.begin(), (Part + 1) * VL.size()), 0);
10943 for (
int I = 0, Sz = VL.size();
I < Sz; ++
I)
10944 if (isa<PoisonValue>(VL[
I]))
10950 Entries.push_back(FirstEntries.front());
10953 assert(UsedTEs.
size() == 2 &&
"Expected at max 2 permuted entries.");
10956 for (
const TreeEntry *TE : UsedTEs.front()) {
10957 unsigned VF =
TE->getVectorFactor();
10958 auto It = VFToTE.
find(VF);
10959 if (It != VFToTE.
end()) {
10960 if (It->second->Idx >
TE->Idx)
10961 It->getSecond() =
TE;
10968 UsedTEs.back().
end());
10969 sort(SecondEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
10970 return TE1->Idx < TE2->Idx;
10972 for (
const TreeEntry *TE : SecondEntries) {
10973 auto It = VFToTE.
find(
TE->getVectorFactor());
10974 if (It != VFToTE.
end()) {
10976 Entries.push_back(It->second);
10977 Entries.push_back(TE);
10983 if (Entries.empty()) {
10985 UsedTEs.front(), [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
10986 return TE1->Idx < TE2->Idx;
10988 Entries.push_back(SecondEntries.front());
10989 VF = std::max(Entries.front()->getVectorFactor(),
10990 Entries.back()->getVectorFactor());
10994 bool IsSplatOrUndefs =
isSplat(VL) ||
all_of(VL, IsaPred<UndefValue>);
10997 auto AreCompatiblePHIs = [&](
Value *
V,
Value *V1) {
10998 auto *
PHI = cast<PHINode>(V);
10999 auto *PHI1 = cast<PHINode>(V1);
11004 for (
int I = 0, E =
PHI->getNumIncomingValues();
I < E; ++
I) {
11006 Value *In1 = PHI1->getIncomingValue(
I);
11011 if (cast<Instruction>(In)->
getParent() !=
11021 auto MightBeIgnored = [=](
Value *
V) {
11022 auto *
I = dyn_cast<Instruction>(V);
11023 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.
count(
I) &&
11025 !areAllUsersVectorized(
I, UserIgnoreList) &&
isSimple(
I);
11030 auto NeighborMightBeIgnored = [&](
Value *
V,
int Idx) {
11032 bool UsedInSameVTE =
false;
11033 auto It = UsedValuesEntry.
find(V1);
11034 if (It != UsedValuesEntry.
end())
11035 UsedInSameVTE = It->second == UsedValuesEntry.
find(V)->second;
11036 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
11038 cast<Instruction>(V)->getParent() ==
11039 cast<Instruction>(V1)->getParent() &&
11040 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
11045 for (
int I = 0, E = VL.size();
I < E; ++
I) {
11047 auto It = UsedValuesEntry.
find(V);
11048 if (It == UsedValuesEntry.
end())
11054 ((
I > 0 && NeighborMightBeIgnored(V,
I - 1)) ||
11055 (
I != E - 1 && NeighborMightBeIgnored(V,
I + 1)))))
11057 unsigned Idx = It->second;
11064 for (
unsigned I = 0, Sz = Entries.size();
I < Sz; ++
I) {
11065 if (!UsedIdxs.test(
I))
11071 for (std::pair<unsigned, int> &Pair : EntryLanes)
11072 if (Pair.first ==
I)
11073 Pair.first = TempEntries.
size();
11076 Entries.swap(TempEntries);
11077 if (EntryLanes.size() == Entries.size() &&
11079 .
slice(Part * VL.size(),
11080 std::min<int>(VL.size(),
TE->Scalars.size())))) {
11086 return std::nullopt;
11089 bool IsIdentity = Entries.size() == 1;
11092 for (
const std::pair<unsigned, int> &Pair : EntryLanes) {
11093 unsigned Idx = Part * VL.size() + Pair.second;
11096 (ForOrder ? std::distance(
11097 Entries[Pair.first]->Scalars.begin(),
11098 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
11099 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
11100 IsIdentity &=
Mask[
Idx] == Pair.second;
11102 switch (Entries.size()) {
11104 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
11108 if (EntryLanes.size() > 2 || VL.size() <= 2)
11116 std::fill(std::next(
Mask.begin(), Part * VL.size()),
11118 return std::nullopt;
11122BoUpSLP::isGatherShuffledEntry(
11126 assert(NumParts > 0 && NumParts < VL.
size() &&
11127 "Expected positive number of registers.");
11130 if (TE == VectorizableTree.front().get())
11133 if (
TE->isNonPowOf2Vec())
11136 assert(
TE->UserTreeIndices.size() == 1 &&
11137 "Expected only single user of the gather node.");
11139 "Number of scalars must be divisible by NumParts.");
11142 for (
unsigned Part : seq<unsigned>(NumParts)) {
11146 std::optional<TTI::ShuffleKind> SubRes =
11147 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
11150 SubEntries.
clear();
11153 SubEntries.
front()->getVectorFactor() == VL.
size() &&
11154 (SubEntries.
front()->isSame(
TE->Scalars) ||
11155 SubEntries.
front()->isSame(VL))) {
11157 LocalSubEntries.
swap(SubEntries);
11160 std::iota(
Mask.begin(),
Mask.end(), 0);
11162 for (
int I = 0, Sz = VL.
size();
I < Sz; ++
I)
11163 if (isa<PoisonValue>(VL[
I]))
11165 Entries.emplace_back(1, LocalSubEntries.
front());
11171 [](
const std::optional<TTI::ShuffleKind> &SK) {
return !SK; })) {
11179 Type *ScalarTy)
const {
11181 bool DuplicateNonConst =
false;
11189 auto EstimateInsertCost = [&](
unsigned I,
Value *
V) {
11190 if (
V->getType() != ScalarTy) {
11201 for (
unsigned I = 0, E = VL.
size();
I < E; ++
I) {
11204 if ((ForPoisonSrc &&
isConstant(V)) || isa<UndefValue>(V)) {
11212 EstimateInsertCost(
I, V);
11213 ShuffleMask[
I] =
I;
11217 DuplicateNonConst =
true;
11219 ShuffleMask[
I] = Res.first->second;
11225 if (DuplicateNonConst)
11227 VecTy, ShuffleMask);
11239 VLOperands Ops(VL, R);
11242 Left = Ops.getVL(0);
11243 Right = Ops.getVL(1);
11246Instruction &BoUpSLP::getLastInstructionInBundle(
const TreeEntry *E) {
11249 return *Res.second;
11253 auto *Front = E->getMainOp();
11256 if (E->getOpcode() == Instruction::GetElementPtr &&
11257 !isa<GetElementPtrInst>(V))
11259 auto *I = cast<Instruction>(V);
11260 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
11261 isVectorLikeInstWithConstOps(I);
11264 auto FindLastInst = [&]() {
11266 for (
Value *V : E->Scalars) {
11267 auto *
I = dyn_cast<Instruction>(V);
11270 if (LastInst->
getParent() ==
I->getParent()) {
11275 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11276 !isa<GetElementPtrInst>(
I)) ||
11279 "Expected vector-like or non-GEP in GEP node insts only.");
11287 auto *NodeB = DT->
getNode(
I->getParent());
11288 assert(NodeA &&
"Should only process reachable instructions");
11289 assert(NodeB &&
"Should only process reachable instructions");
11290 assert((NodeA == NodeB) ==
11291 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11292 "Different nodes should have different DFS numbers");
11293 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
11300 auto FindFirstInst = [&]() {
11302 for (
Value *V : E->Scalars) {
11303 auto *
I = dyn_cast<Instruction>(V);
11306 if (FirstInst->
getParent() ==
I->getParent()) {
11307 if (
I->comesBefore(FirstInst))
11311 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11312 !isa<GetElementPtrInst>(
I)) ||
11315 "Expected vector-like or non-GEP in GEP node insts only.");
11323 auto *NodeB = DT->
getNode(
I->getParent());
11324 assert(NodeA &&
"Should only process reachable instructions");
11325 assert(NodeB &&
"Should only process reachable instructions");
11326 assert((NodeA == NodeB) ==
11327 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11328 "Different nodes should have different DFS numbers");
11329 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11338 (E->State != TreeEntry::NeedToGather &&
11340 if ((E->getOpcode() == Instruction::GetElementPtr &&
11343 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
11347 return !isVectorLikeInstWithConstOps(V) &&
11348 isUsedOutsideBlock(V);
11350 (E->State == TreeEntry::NeedToGather && E->Idx == 0 &&
11352 return isa<ExtractElementInst, UndefValue>(V) ||
11353 areAllOperandsNonInsts(V);
11355 Res.second = FindLastInst();
11357 Res.second = FindFirstInst();
11358 return *Res.second;
11365 if (BlocksSchedules.count(BB)) {
11366 Value *
V = E->isOneOf(E->Scalars.back());
11369 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11370 if (Bundle && Bundle->isPartOfBundle())
11371 for (; Bundle; Bundle = Bundle->NextInBundle)
11372 if (Bundle->OpValue == Bundle->Inst)
11373 Res.second = Bundle->Inst;
11395 Res.second = FindLastInst();
11396 assert(Res.second &&
"Failed to find last instruction in bundle");
11397 return *Res.second;
11400void BoUpSLP::setInsertPointAfterBundle(
const TreeEntry *E) {
11401 auto *Front = E->getMainOp();
11402 Instruction *LastInst = &getLastInstructionInBundle(E);
11403 assert(LastInst &&
"Failed to find last instruction in bundle");
11406 bool IsPHI = isa<PHINode>(LastInst);
11408 LastInstIt = LastInst->
getParent()->getFirstNonPHIIt();
11409 if (IsPHI || (E->State != TreeEntry::NeedToGather &&
11411 Builder.SetInsertPoint(LastInst->
getParent(), LastInstIt);
11415 Builder.SetInsertPoint(
11419 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11429 Loop *
L = LI->getLoopFor(Builder.GetInsertBlock());
11432 while (InsertBB && InsertBB != InstBB && Visited.
insert(InsertBB).second)
11433 InsertBB = InsertBB->getSinglePredecessor();
11434 return InsertBB && InsertBB == InstBB;
11436 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
11437 if (
auto *Inst = dyn_cast<Instruction>(VL[
I]))
11438 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11439 getTreeEntry(Inst) ||
11440 (L && (!Root ||
L->isLoopInvariant(Root)) &&
L->contains(Inst))) &&
11441 PostponedIndices.
insert(
I).second)
11445 auto &&CreateInsertElement = [
this](
Value *Vec,
Value *
V,
unsigned Pos,
11448 if (
Scalar->getType() != Ty) {
11450 "Expected integer types only.");
11452 if (
auto *CI = dyn_cast<CastInst>(Scalar);
11453 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
11455 if (
auto *IOp = dyn_cast<Instruction>(
Op);
11456 !IOp || !(
isDeleted(IOp) || getTreeEntry(IOp)))
11459 Scalar = Builder.CreateIntCast(
11463 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
11464 auto *InsElt = dyn_cast<InsertElementInst>(Vec);
11467 GatherShuffleExtractSeq.
insert(InsElt);
11468 CSEBlocks.
insert(InsElt->getParent());
11470 if (isa<Instruction>(V)) {
11471 if (TreeEntry *Entry = getTreeEntry(V)) {
11473 User *UserOp =
nullptr;
11475 if (
auto *SI = dyn_cast<Instruction>(Scalar))
11481 unsigned FoundLane =
Entry->findLaneForValue(V);
11482 ExternalUses.emplace_back(V, UserOp, FoundLane);
11492 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
11500 if (!isa<UndefValue>(VL[
I])) {
11504 if (isa<PoisonValue>(VL[
I]))
11506 if (
auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
11511 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
11514 for (
int I : NonConsts)
11515 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
11518 for (
const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11519 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11557 bool IsFinalized =
false;
11567 Type *ScalarTy =
nullptr;
11571 class ShuffleIRBuilder {
11584 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11585 CSEBlocks(CSEBlocks),
DL(
DL) {}
11586 ~ShuffleIRBuilder() =
default;
11589 if (V1->
getType() != V2->getType()) {
11592 "Expected integer vector types only.");
11593 if (V1->
getType() != V2->getType()) {
11594 if (cast<VectorType>(V2->getType())
11596 ->getIntegerBitWidth() < cast<VectorType>(V1->
getType())
11598 ->getIntegerBitWidth())
11607 if (
auto *
I = dyn_cast<Instruction>(Vec)) {
11608 GatherShuffleExtractSeq.
insert(
I);
11609 CSEBlocks.
insert(
I->getParent());
11618 unsigned VF = Mask.size();
11619 unsigned LocalVF = cast<FixedVectorType>(V1->
getType())->getNumElements();
11623 if (
auto *
I = dyn_cast<Instruction>(Vec)) {
11624 GatherShuffleExtractSeq.
insert(
I);
11625 CSEBlocks.
insert(
I->getParent());
11629 Value *createIdentity(
Value *V) {
return V; }
11630 Value *createPoison(
Type *Ty,
unsigned VF) {
11635 void resizeToMatch(
Value *&V1,
Value *&V2) {
11636 if (V1->
getType() == V2->getType())
11638 int V1VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
11639 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
11640 int VF = std::max(V1VF, V2VF);
11641 int MinVF = std::min(V1VF, V2VF);
11643 std::iota(IdentityMask.
begin(), std::next(IdentityMask.
begin(), MinVF),
11645 Value *&
Op = MinVF == V1VF ? V1 : V2;
11647 if (
auto *
I = dyn_cast<Instruction>(
Op)) {
11648 GatherShuffleExtractSeq.
insert(
I);
11649 CSEBlocks.
insert(
I->getParent());
11662 assert(V1 &&
"Expected at least one vector value.");
11663 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11664 R.CSEBlocks, *R.DL);
11665 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11673 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11681 std::optional<bool> IsSigned = std::nullopt) {
11682 auto *VecTy = cast<VectorType>(V->getType());
11692 : ScalarTy(ScalarTy), Builder(Builder), R(R) {}
11696 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11697 unsigned NumParts,
bool &UseVecBaseAsInput) {
11698 UseVecBaseAsInput =
false;
11700 Value *VecBase =
nullptr;
11701 for (
int I = 0, Sz = Mask.size();
I < Sz; ++
I) {
11705 auto *EI = cast<ExtractElementInst>(E->Scalars[
I]);
11706 VecBase = EI->getVectorOperand();
11707 if (
const TreeEntry *TE = R.getTreeEntry(VecBase))
11708 VecBase = TE->VectorizedValue;
11709 assert(VecBase &&
"Expected vectorized value.");
11710 UniqueBases.
insert(VecBase);
11713 if (!EI->hasOneUse() || (NumParts != 1 &&
count(E->Scalars, EI) > 1) ||
11715 const TreeEntry *UTE = R.getTreeEntry(U);
11716 return !UTE || R.MultiNodeScalars.contains(U) ||
11717 (isa<GetElementPtrInst>(U) &&
11718 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
11719 count_if(R.VectorizableTree,
11720 [&](const std::unique_ptr<TreeEntry> &TE) {
11721 return any_of(TE->UserTreeIndices,
11722 [&](const EdgeInfo &Edge) {
11723 return Edge.UserTE == UTE;
11725 is_contained(TE->Scalars, EI);
11729 R.eraseInstruction(EI);
11731 if (NumParts == 1 || UniqueBases.
size() == 1) {
11732 assert(VecBase &&
"Expected vectorized value.");
11733 return castToScalarTyElem(VecBase);
11735 UseVecBaseAsInput =
true;
11745 Value *Vec =
nullptr;
11748 for (
unsigned Part : seq<unsigned>(NumParts)) {
11749 unsigned Limit =
getNumElems(E->Scalars.size(), SliceSize, Part);
11753 constexpr int MaxBases = 2;
11761 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
11762 if (
const TreeEntry *TE = R.getTreeEntry(VecOp))
11763 VecOp = TE->VectorizedValue;
11764 assert(VecOp &&
"Expected vectorized value.");
11766 cast<FixedVectorType>(VecOp->
getType())->getNumElements();
11768 assert((PrevSize ==
Size || PrevSize == 0) &&
11769 "Expected vectors of the same size.");
11772 VecOp = castToScalarTyElem(VecOp);
11773 Bases[SubMask[
I] <
Size ? 0 : 1] = VecOp;
11775 if (!Bases.front())
11778 if (Bases.back()) {
11779 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
11780 TransformToIdentity(SubMask);
11782 SubVec = Bases.front();
11789 Mask.slice(
P * SliceSize,
11796 "Expected first part or all previous parts masked.");
11797 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11799 unsigned VF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
11801 unsigned SubVecVF =
11802 cast<FixedVectorType>(SubVec->
getType())->getNumElements();
11803 VF = std::max(VF, SubVecVF);
11806 for (
int &
Idx : SubMask)
11809 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11810 Vec = createShuffle(Vec, SubVec, VecMask);
11811 TransformToIdentity(VecMask);
11819 std::optional<Value *>
11825 TEs, [](
const TreeEntry *TE) {
return TE->VectorizedValue; });
11827 return std::nullopt;
11839 Value *V1 = E1.VectorizedValue;
11841 V1 = castToScalarTyElem(V1,
any_of(E1.Scalars, [&](
Value *V) {
11842 return !isKnownNonNegative(
11843 V, SimplifyQuery(*R.DL));
11845 Value *V2 = E2.VectorizedValue;
11846 if (V2->getType()->isIntOrIntVectorTy())
11847 V2 = castToScalarTyElem(V2,
any_of(E2.Scalars, [&](
Value *V) {
11848 return !isKnownNonNegative(
11849 V, SimplifyQuery(*R.DL));
11856 Value *V1 = E1.VectorizedValue;
11858 V1 = castToScalarTyElem(V1,
any_of(E1.Scalars, [&](
Value *V) {
11859 return !isKnownNonNegative(
11860 V, SimplifyQuery(*R.DL));
11866 assert(V1 && V2 && !Mask.empty() &&
"Expected non-empty input vectors.");
11867 V1 = castToScalarTyElem(V1);
11868 V2 = castToScalarTyElem(V2);
11869 if (InVectors.
empty()) {
11872 CommonMask.
assign(Mask.begin(), Mask.end());
11876 if (InVectors.
size() == 2) {
11877 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
11878 transformMaskAfterShuffle(CommonMask, CommonMask);
11879 }
else if (cast<FixedVectorType>(Vec->
getType())->getNumElements() !=
11881 Vec = createShuffle(Vec,
nullptr, CommonMask);
11882 transformMaskAfterShuffle(CommonMask, CommonMask);
11884 V1 = createShuffle(V1, V2, Mask);
11885 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11887 CommonMask[
Idx] =
Idx + Sz;
11888 InVectors.
front() = Vec;
11889 if (InVectors.
size() == 2)
11890 InVectors.
back() = V1;
11896 V1 = castToScalarTyElem(V1);
11897 if (InVectors.
empty()) {
11898 if (!isa<FixedVectorType>(V1->
getType())) {
11899 V1 = createShuffle(V1,
nullptr, CommonMask);
11901 transformMaskAfterShuffle(CommonMask, Mask);
11904 CommonMask.
assign(Mask.begin(), Mask.end());
11907 const auto *It =
find(InVectors, V1);
11908 if (It == InVectors.
end()) {
11909 if (InVectors.
size() == 2 ||
11911 !isa<FixedVectorType>(V1->
getType())) {
11913 if (InVectors.
size() == 2) {
11914 V = createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
11915 transformMaskAfterShuffle(CommonMask, CommonMask);
11916 }
else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
11917 CommonMask.
size()) {
11918 V = createShuffle(InVectors.
front(),
nullptr, CommonMask);
11919 transformMaskAfterShuffle(CommonMask, CommonMask);
11921 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11924 V->getType() != V1->
getType()
11926 : Mask[
Idx] + cast<FixedVectorType>(V1->
getType())
11927 ->getNumElements();
11928 if (V->getType() != V1->
getType())
11929 V1 = createShuffle(V1,
nullptr, Mask);
11930 InVectors.
front() = V;
11931 if (InVectors.
size() == 2)
11932 InVectors.
back() = V1;
11939 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11945 int VF = CommonMask.
size();
11946 if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
11947 VF = FTy->getNumElements();
11948 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11950 CommonMask[
Idx] = Mask[
Idx] + (It == InVectors.
begin() ? 0 : VF);
11959 Value *Root =
nullptr) {
11960 return R.gather(VL, Root, ScalarTy);
11969 IsFinalized =
true;
11972 if (InVectors.
size() == 2) {
11973 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
11976 Vec = createShuffle(Vec,
nullptr, CommonMask);
11978 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
11982 "Expected vector length for the final value before action.");
11983 unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
11986 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
11987 Vec = createShuffle(Vec,
nullptr, ResizeMask);
11989 Action(Vec, CommonMask);
11990 InVectors.
front() = Vec;
11992 if (!ExtMask.
empty()) {
11993 if (CommonMask.
empty()) {
11997 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
12000 NewMask[
I] = CommonMask[ExtMask[
I]];
12002 CommonMask.
swap(NewMask);
12005 if (CommonMask.
empty()) {
12006 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
12007 return InVectors.
front();
12009 if (InVectors.
size() == 2)
12010 return createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
12011 return createShuffle(InVectors.
front(),
nullptr, CommonMask);
12016 "Shuffle construction must be finalized.");
12020Value *BoUpSLP::vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
12021 bool PostponedPHIs) {
12022 ValueList &VL = E->getOperand(NodeIdx);
12023 const unsigned VF = VL.size();
12026 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
12027 const auto *It =
find_if(VL, IsaPred<GetElementPtrInst>);
12028 if (It != VL.end())
12031 if (S.getOpcode()) {
12032 auto CheckSameVE = [&](
const TreeEntry *VE) {
12033 return VE->isSame(VL) &&
12034 (
any_of(VE->UserTreeIndices,
12035 [E, NodeIdx](
const EdgeInfo &EI) {
12036 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12038 any_of(VectorizableTree,
12039 [E, NodeIdx, VE](
const std::unique_ptr<TreeEntry> &TE) {
12040 return TE->isOperandGatherNode({E, NodeIdx}) &&
12041 VE->isSame(TE->Scalars);
12044 TreeEntry *VE = getTreeEntry(S.OpValue);
12045 bool IsSameVE = VE && CheckSameVE(VE);
12047 auto It = MultiNodeScalars.
find(S.OpValue);
12048 if (It != MultiNodeScalars.
end()) {
12049 auto *
I =
find_if(It->getSecond(), [&](
const TreeEntry *TE) {
12050 return TE != VE && CheckSameVE(TE);
12052 if (
I != It->getSecond().end()) {
12060 ShuffleInstructionBuilder ShuffleBuilder(
12061 cast<VectorType>(
V->getType())->getElementType(), Builder, *
this);
12062 ShuffleBuilder.add(V, Mask);
12063 return ShuffleBuilder.finalize(std::nullopt);
12066 if (VF != cast<FixedVectorType>(
V->getType())->getNumElements()) {
12067 if (!VE->ReuseShuffleIndices.empty()) {
12088 if (isa<PoisonValue>(V))
12090 Mask[
I] = VE->findLaneForValue(V);
12092 V = FinalShuffle(V, Mask);
12094 assert(VF < cast<FixedVectorType>(
V->getType())->getNumElements() &&
12095 "Expected vectorization factor less "
12096 "than original vector size.");
12098 std::iota(UniformMask.begin(), UniformMask.end(), 0);
12099 V = FinalShuffle(V, UniformMask);
12105 if (
find_if(VE->UserTreeIndices, [&](
const EdgeInfo &EI) {
12106 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12107 }) == VE->UserTreeIndices.end()) {
12109 VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
12110 return TE->State == TreeEntry::NeedToGather &&
12111 TE->UserTreeIndices.front().UserTE == E &&
12112 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
12114 assert(It != VectorizableTree.end() &&
"Expected gather node operand.");
12115 (*It)->VectorizedValue =
V;
12124 auto *
I =
find_if(VectorizableTree,
12125 [E, NodeIdx](
const std::unique_ptr<TreeEntry> &TE) {
12126 return TE->isOperandGatherNode({E, NodeIdx});
12128 assert(
I != VectorizableTree.end() &&
"Gather node is not in the graph.");
12129 assert(
I->get()->UserTreeIndices.size() == 1 &&
12130 "Expected only single user for the gather node.");
12131 assert(
I->get()->isSame(VL) &&
"Expected same list of scalars.");
12135template <
typename BVTy,
typename ResTy,
typename...
Args>
12136ResTy BoUpSLP::processBuildVector(
const TreeEntry *E,
Type *ScalarTy,
12138 assert(E->State == TreeEntry::NeedToGather &&
"Expected gather node.");
12139 unsigned VF = E->getVectorFactor();
12141 bool NeedFreeze =
false;
12143 E->ReuseShuffleIndices.end());
12149 if (!ReorderMask.
empty())
12152 unsigned I,
unsigned SliceSize) {
12154 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12157 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
12158 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
12159 if (UserTE->getNumOperands() != 2)
12162 find_if(VectorizableTree, [=](
const std::unique_ptr<TreeEntry> &TE) {
12163 return find_if(
TE->UserTreeIndices, [=](
const EdgeInfo &EI) {
12164 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
12165 }) !=
TE->UserTreeIndices.end();
12167 if (It == VectorizableTree.end())
12170 if ((
Mask.size() < InputVF &&
12173 (
Mask.size() == InputVF &&
12176 std::next(
Mask.begin(),
I * SliceSize),
12177 std::next(
Mask.begin(),
12184 std::next(
Mask.begin(),
I * SliceSize),
12185 std::next(
Mask.begin(),
12191 BVTy ShuffleBuilder(ScalarTy, Params...);
12192 ResTy Res = ResTy();
12196 Value *ExtractVecBase =
nullptr;
12197 bool UseVecBaseAsInput =
false;
12200 Type *OrigScalarTy = GatheredScalars.front()->getType();
12203 if (NumParts == 0 || NumParts >= GatheredScalars.size())
12205 if (!
all_of(GatheredScalars, IsaPred<UndefValue>)) {
12207 bool Resized =
false;
12209 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
12210 if (!ExtractShuffles.
empty()) {
12215 if (
const auto *TE = getTreeEntry(
12216 cast<ExtractElementInst>(E->Scalars[
Idx])->getVectorOperand()))
12219 if (std::optional<ResTy> Delayed =
12220 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
12222 PostponedGathers.
insert(E);
12227 if (
Value *VecBase = ShuffleBuilder.adjustExtracts(
12228 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
12229 ExtractVecBase = VecBase;
12230 if (
auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
12231 if (VF == VecBaseTy->getNumElements() &&
12232 GatheredScalars.size() != VF) {
12234 GatheredScalars.append(VF - GatheredScalars.size(),
12240 if (!ExtractShuffles.
empty() || E->getOpcode() != Instruction::Load ||
12241 E->isAltShuffle() ||
12242 all_of(E->Scalars, [
this](
Value *V) { return getTreeEntry(V); }) ||
12244 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
12246 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
12248 if (!GatherShuffles.
empty()) {
12249 if (std::optional<ResTy> Delayed =
12250 ShuffleBuilder.needToDelay(E, Entries)) {
12252 PostponedGathers.
insert(E);
12257 if (GatherShuffles.
size() == 1 &&
12259 Entries.front().front()->isSame(E->Scalars)) {
12264 <<
"SLP: perfect diamond match for gather bundle "
12267 Mask.resize(E->Scalars.size());
12268 const TreeEntry *FrontTE = Entries.front().front();
12269 if (FrontTE->ReorderIndices.empty() &&
12270 ((FrontTE->ReuseShuffleIndices.empty() &&
12271 E->Scalars.size() == FrontTE->Scalars.size()) ||
12272 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
12273 std::iota(
Mask.begin(),
Mask.end(), 0);
12276 if (isa<PoisonValue>(V)) {
12280 Mask[
I] = FrontTE->findLaneForValue(V);
12283 ShuffleBuilder.add(*FrontTE, Mask);
12284 Res = ShuffleBuilder.finalize(E->getCommonMask());
12288 if (GatheredScalars.size() != VF &&
12290 return any_of(TEs, [&](
const TreeEntry *TE) {
12291 return TE->getVectorFactor() == VF;
12294 GatheredScalars.append(VF - GatheredScalars.size(),
12298 for (
int I = 0, Sz =
Mask.size();
I < Sz; ++
I) {
12306 bool IsRootPoison) {
12309 bool IsSplat = IsRootPoison &&
isSplat(Scalars) &&
12316 int NumNonConsts = 0;
12319 if (isa<UndefValue>(V)) {
12320 if (!isa<PoisonValue>(V)) {
12335 Scalars.
front() = OrigV;
12338 const auto Res = UniquePositions.
try_emplace(OrigV,
I);
12339 Scalars[Res.first->second] = OrigV;
12340 ReuseMask[
I] = Res.first->second;
12343 if (NumNonConsts == 1) {
12348 if (!UndefPos.
empty() && UndefPos.
front() == 0)
12351 ReuseMask[SinglePos] = SinglePos;
12352 }
else if (!UndefPos.
empty() && IsSplat) {
12357 return !isa<UndefValue>(V) &&
12359 (E->UserTreeIndices.size() == 1 &&
12363 return E->UserTreeIndices.front().EdgeIdx !=
12364 U.getOperandNo() &&
12366 E->UserTreeIndices.front().UserTE->Scalars,
12370 if (It != Scalars.
end()) {
12372 int Pos = std::distance(Scalars.
begin(), It);
12373 for (
int I : UndefPos) {
12375 ReuseMask[
I] = Pos;
12384 for (
int I : UndefPos) {
12386 if (isa<UndefValue>(Scalars[
I]))
12393 if (!ExtractShuffles.
empty() || !GatherShuffles.
empty()) {
12394 bool IsNonPoisoned =
true;
12395 bool IsUsedInExpr =
true;
12396 Value *Vec1 =
nullptr;
12397 if (!ExtractShuffles.
empty()) {
12401 Value *Vec2 =
nullptr;
12402 for (
unsigned I = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
12406 if (UseVecBaseAsInput) {
12407 Vec1 = ExtractVecBase;
12409 for (
unsigned I = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
12412 if (isa<UndefValue>(E->Scalars[
I]))
12414 auto *EI = cast<ExtractElementInst>(E->Scalars[
I]);
12415 Value *VecOp = EI->getVectorOperand();
12416 if (
const auto *TE = getTreeEntry(VecOp))
12417 if (
TE->VectorizedValue)
12418 VecOp =
TE->VectorizedValue;
12421 }
else if (Vec1 != VecOp) {
12422 assert((!Vec2 || Vec2 == VecOp) &&
12423 "Expected only 1 or 2 vectors shuffle.");
12429 IsUsedInExpr =
false;
12432 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12434 IsUsedInExpr &= FindReusedSplat(
12436 cast<FixedVectorType>(Vec1->
getType())->getNumElements(), 0,
12437 ExtractMask.size());
12438 ShuffleBuilder.add(Vec1, ExtractMask,
true);
12441 IsUsedInExpr =
false;
12446 if (!GatherShuffles.
empty()) {
12449 for (
const auto [
I, TEs] :
enumerate(Entries)) {
12452 "No shuffles with empty entries list expected.");
12456 "Expected shuffle of 1 or 2 entries.");
12460 copy(SubMask, std::next(VecMask.begin(),
I * SliceSize));
12461 if (TEs.
size() == 1) {
12462 IsUsedInExpr &= FindReusedSplat(
12463 VecMask, TEs.
front()->getVectorFactor(),
I, SliceSize);
12464 ShuffleBuilder.add(*TEs.
front(), VecMask);
12465 if (TEs.
front()->VectorizedValue)
12469 IsUsedInExpr =
false;
12470 ShuffleBuilder.add(*TEs.
front(), *TEs.
back(), VecMask);
12471 if (TEs.
front()->VectorizedValue && TEs.
back()->VectorizedValue)
12482 int EMSz = ExtractMask.size();
12483 int MSz =
Mask.size();
12486 bool IsSingleShuffle = ExtractShuffles.
empty() || GatherShuffles.
empty();
12487 bool IsIdentityShuffle =
12488 ((UseVecBaseAsInput ||
12490 [](
const std::optional<TTI::ShuffleKind> &SK) {
12494 none_of(ExtractMask, [&](
int I) {
return I >= EMSz; }) &&
12496 (!GatherShuffles.
empty() &&
12498 [](
const std::optional<TTI::ShuffleKind> &SK) {
12502 none_of(Mask, [&](
int I) {
return I >= MSz; }) &&
12504 bool EnoughConstsForShuffle =
12508 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12512 return isa<Constant>(V) && !isa<UndefValue>(V);
12514 (!IsIdentityShuffle ||
12515 (GatheredScalars.size() == 2 &&
12517 [](
Value *V) {
return !isa<UndefValue>(V); })) ||
12519 return isa<Constant>(V) && !isa<PoisonValue>(V);
12523 for (
int I = 0, Sz = GatheredScalars.size();
I < Sz; ++
I) {
12524 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[
I]))
12530 if (!
all_of(GatheredScalars, IsaPred<PoisonValue>)) {
12532 TryPackScalars(GatheredScalars, BVMask,
true);
12533 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12534 ShuffleBuilder.add(BV, BVMask);
12537 return isa<PoisonValue>(V) ||
12538 (IsSingleShuffle && ((IsIdentityShuffle &&
12539 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
12541 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12543 Res = ShuffleBuilder.finalize(
12544 E->ReuseShuffleIndices, E->Scalars.size(),
12546 TryPackScalars(NonConstants, Mask,
false);
12547 Vec = ShuffleBuilder.gather(NonConstants,
Mask.size(), Vec);
12552 TryPackScalars(GatheredScalars, ReuseMask,
true);
12553 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.
size());
12554 ShuffleBuilder.add(BV, ReuseMask);
12555 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12560 if (!isa<PoisonValue>(V))
12563 Value *BV = ShuffleBuilder.gather(E->Scalars);
12564 ShuffleBuilder.add(BV, Mask);
12565 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12569 Res = ShuffleBuilder.createFreeze(Res);
12573Value *BoUpSLP::createBuildVector(
const TreeEntry *E,
Type *ScalarTy) {
12574 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
12581 if (E->VectorizedValue &&
12582 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12583 E->isAltShuffle())) {
12584 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *E->Scalars[0] <<
".\n");
12585 return E->VectorizedValue;
12588 Value *
V = E->Scalars.front();
12589 Type *ScalarTy =
V->getType();
12590 if (
auto *Store = dyn_cast<StoreInst>(V))
12591 ScalarTy =
Store->getValueOperand()->getType();
12592 else if (
auto *IE = dyn_cast<InsertElementInst>(V))
12593 ScalarTy =
IE->getOperand(1)->getType();
12594 auto It = MinBWs.
find(E);
12595 if (It != MinBWs.
end())
12598 if (E->State == TreeEntry::NeedToGather) {
12600 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12601 setInsertPointAfterBundle(E);
12602 Value *Vec = createBuildVector(E, ScalarTy);
12603 E->VectorizedValue = Vec;
12608 auto FinalShuffle = [&](
Value *
V,
const TreeEntry *E,
VectorType *VecTy) {
12609 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *
this);
12610 if (E->getOpcode() == Instruction::Store &&
12611 E->State == TreeEntry::Vectorize) {
12613 ArrayRef(
reinterpret_cast<const int *
>(E->ReorderIndices.begin()),
12614 E->ReorderIndices.size());
12615 ShuffleBuilder.add(V, Mask);
12616 }
else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12617 ShuffleBuilder.addOrdered(V, std::nullopt);
12619 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
12621 return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12624 assert((E->State == TreeEntry::Vectorize ||
12625 E->State == TreeEntry::ScatterVectorize ||
12626 E->State == TreeEntry::StridedVectorize) &&
12627 "Unhandled state");
12628 unsigned ShuffleOrOp =
12629 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
12631 auto GetOperandSignedness = [&](
unsigned Idx) {
12632 const TreeEntry *OpE = getOperandEntry(E,
Idx);
12633 bool IsSigned =
false;
12634 auto It = MinBWs.
find(OpE);
12635 if (It != MinBWs.
end())
12636 IsSigned = It->second.second;
12639 return !isKnownNonNegative(R, SimplifyQuery(*DL));
12643 switch (ShuffleOrOp) {
12644 case Instruction::PHI: {
12645 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
12646 E != VectorizableTree.front().get() ||
12647 !E->UserTreeIndices.empty()) &&
12648 "PHI reordering is free.");
12649 if (PostponedPHIs && E->VectorizedValue)
12650 return E->VectorizedValue;
12651 auto *PH = cast<PHINode>(VL0);
12653 PH->getParent()->getFirstNonPHIIt());
12655 if (PostponedPHIs || !E->VectorizedValue) {
12662 PH->getParent()->getFirstInsertionPt());
12665 V = FinalShuffle(V, E, VecTy);
12667 E->VectorizedValue =
V;
12671 PHINode *NewPhi = cast<PHINode>(E->PHI);
12680 for (
unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
12686 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12690 if (!VisitedBBs.
insert(IBB).second) {
12697 Value *Vec = vectorizeOperand(E,
I,
true);
12698 if (VecTy != Vec->
getType()) {
12700 getOperandEntry(E,
I)->State == TreeEntry::NeedToGather ||
12701 MinBWs.
contains(getOperandEntry(E,
I))) &&
12702 "Expected item in MinBWs.");
12703 Vec = Builder.
CreateIntCast(Vec, VecTy, GetOperandSignedness(
I));
12709 "Invalid number of incoming values");
12713 case Instruction::ExtractElement: {
12714 Value *
V = E->getSingleOperand(0);
12715 if (
const TreeEntry *TE = getTreeEntry(V))
12716 V =
TE->VectorizedValue;
12717 setInsertPointAfterBundle(E);
12718 V = FinalShuffle(V, E, VecTy);
12719 E->VectorizedValue =
V;
12722 case Instruction::ExtractValue: {
12723 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
12728 NewV = FinalShuffle(NewV, E, VecTy);
12729 E->VectorizedValue = NewV;
12732 case Instruction::InsertElement: {
12733 assert(E->ReuseShuffleIndices.empty() &&
"All inserts should be unique");
12735 Value *
V = vectorizeOperand(E, 1, PostponedPHIs);
12737 Type *ScalarTy =
Op.front()->getType();
12738 if (cast<VectorType>(
V->getType())->getElementType() != ScalarTy) {
12740 std::pair<unsigned, bool> Res = MinBWs.
lookup(getOperandEntry(E, 1));
12741 assert(Res.first > 0 &&
"Expected item in MinBWs.");
12746 cast<FixedVectorType>(
V->getType())->getNumElements()),
12751 auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
12752 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
12754 const unsigned NumElts =
12755 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
12756 const unsigned NumScalars = E->Scalars.size();
12759 assert(
Offset < NumElts &&
"Failed to find vector index offset");
12763 if (!E->ReorderIndices.empty()) {
12768 std::iota(
Mask.begin(), std::next(
Mask.begin(), NumScalars), 0);
12771 bool IsIdentity =
true;
12773 Mask.swap(PrevMask);
12774 for (
unsigned I = 0;
I < NumScalars; ++
I) {
12777 IsIdentity &= InsertIdx -
Offset ==
I;
12780 if (!IsIdentity || NumElts != NumScalars) {
12784 if (NumElts != NumScalars &&
Offset == 0) {
12793 InsertMask[*InsertIdx] = *InsertIdx;
12794 if (!
Ins->hasOneUse())
12796 Ins = dyn_cast_or_null<InsertElementInst>(
12797 Ins->getUniqueUndroppableUser());
12800 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12802 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12805 if (!IsFirstPoison.
all()) {
12807 for (
unsigned I = 0;
I < NumElts;
I++) {
12809 IsFirstUndef.
test(
I)) {
12810 if (IsVNonPoisonous) {
12811 InsertMask[
I] =
I < NumScalars ?
I : 0;
12816 if (
Idx >= NumScalars)
12817 Idx = NumScalars - 1;
12818 InsertMask[
I] = NumScalars +
Idx;
12832 if (
auto *
I = dyn_cast<Instruction>(V)) {
12833 GatherShuffleExtractSeq.
insert(
I);
12834 CSEBlocks.
insert(
I->getParent());
12839 for (
unsigned I = 0;
I < NumElts;
I++) {
12844 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12847 if ((!IsIdentity ||
Offset != 0 || !IsFirstUndef.
all()) &&
12848 NumElts != NumScalars) {
12849 if (IsFirstUndef.
all()) {
12852 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12853 if (!IsFirstPoison.
all()) {
12854 for (
unsigned I = 0;
I < NumElts;
I++) {
12856 InsertMask[
I] =
I + NumElts;
12863 InsertMask, cast<Instruction>(E->Scalars.back())->
getName());
12864 if (
auto *
I = dyn_cast<Instruction>(V)) {
12865 GatherShuffleExtractSeq.
insert(
I);
12866 CSEBlocks.
insert(
I->getParent());
12871 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12872 for (
unsigned I = 0;
I < NumElts;
I++) {
12876 InsertMask[
I] += NumElts;
12879 FirstInsert->getOperand(0), V, InsertMask,
12880 cast<Instruction>(E->Scalars.back())->getName());
12881 if (
auto *
I = dyn_cast<Instruction>(V)) {
12882 GatherShuffleExtractSeq.
insert(
I);
12883 CSEBlocks.
insert(
I->getParent());
12888 ++NumVectorInstructions;
12889 E->VectorizedValue =
V;
12892 case Instruction::ZExt:
12893 case Instruction::SExt:
12894 case Instruction::FPToUI:
12895 case Instruction::FPToSI:
12896 case Instruction::FPExt:
12897 case Instruction::PtrToInt:
12898 case Instruction::IntToPtr:
12899 case Instruction::SIToFP:
12900 case Instruction::UIToFP:
12901 case Instruction::Trunc:
12902 case Instruction::FPTrunc:
12903 case Instruction::BitCast: {
12904 setInsertPointAfterBundle(E);
12906 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
12907 if (E->VectorizedValue) {
12908 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12909 return E->VectorizedValue;
12912 auto *CI = cast<CastInst>(VL0);
12914 Type *SrcScalarTy = cast<VectorType>(InVec->
getType())->getElementType();
12915 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
12917 (SrcIt != MinBWs.
end() || It != MinBWs.
end() ||
12920 unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy);
12921 if (SrcIt != MinBWs.
end())
12922 SrcBWSz = SrcIt->second.first;
12923 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
12924 if (BWSz == SrcBWSz) {
12925 VecOpcode = Instruction::BitCast;
12926 }
else if (BWSz < SrcBWSz) {
12927 VecOpcode = Instruction::Trunc;
12928 }
else if (It != MinBWs.
end()) {
12929 assert(BWSz > SrcBWSz &&
"Invalid cast!");
12930 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12931 }
else if (SrcIt != MinBWs.
end()) {
12932 assert(BWSz > SrcBWSz &&
"Invalid cast!");
12934 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
12936 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.
end() &&
12937 !SrcIt->second.second) {
12938 VecOpcode = Instruction::UIToFP;
12940 Value *
V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
12942 : Builder.
CreateCast(VecOpcode, InVec, VecTy);
12943 V = FinalShuffle(V, E, VecTy);
12945 E->VectorizedValue =
V;
12946 ++NumVectorInstructions;
12949 case Instruction::FCmp:
12950 case Instruction::ICmp: {
12951 setInsertPointAfterBundle(E);
12953 Value *
L = vectorizeOperand(E, 0, PostponedPHIs);
12954 if (E->VectorizedValue) {
12955 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12956 return E->VectorizedValue;
12958 Value *
R = vectorizeOperand(E, 1, PostponedPHIs);
12959 if (E->VectorizedValue) {
12960 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12961 return E->VectorizedValue;
12963 if (
L->getType() !=
R->getType()) {
12964 assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12965 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12966 MinBWs.
contains(getOperandEntry(E, 0)) ||
12967 MinBWs.
contains(getOperandEntry(E, 1))) &&
12968 "Expected item in MinBWs.");
12969 if (cast<VectorType>(
L->getType())
12971 ->getIntegerBitWidth() < cast<VectorType>(
R->getType())
12973 ->getIntegerBitWidth()) {
12974 Type *CastTy =
R->getType();
12977 Type *CastTy =
L->getType();
12986 VecTy = cast<FixedVectorType>(
V->getType());
12987 V = FinalShuffle(V, E, VecTy);
12989 E->VectorizedValue =
V;
12990 ++NumVectorInstructions;
12993 case Instruction::Select: {
12994 setInsertPointAfterBundle(E);
12996 Value *
Cond = vectorizeOperand(E, 0, PostponedPHIs);
12997 if (E->VectorizedValue) {
12998 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
12999 return E->VectorizedValue;
13001 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
13002 if (E->VectorizedValue) {
13003 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13004 return E->VectorizedValue;
13006 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
13007 if (E->VectorizedValue) {
13008 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13009 return E->VectorizedValue;
13013 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
13014 getOperandEntry(E, 2)->State == TreeEntry::NeedToGather ||
13015 MinBWs.
contains(getOperandEntry(E, 1)) ||
13016 MinBWs.
contains(getOperandEntry(E, 2))) &&
13017 "Expected item in MinBWs.");
13018 if (True->
getType() != VecTy)
13019 True = Builder.
CreateIntCast(True, VecTy, GetOperandSignedness(1));
13020 if (False->
getType() != VecTy)
13021 False = Builder.
CreateIntCast(False, VecTy, GetOperandSignedness(2));
13025 V = FinalShuffle(V, E, VecTy);
13027 E->VectorizedValue =
V;
13028 ++NumVectorInstructions;
13031 case Instruction::FNeg: {
13032 setInsertPointAfterBundle(E);
13034 Value *
Op = vectorizeOperand(E, 0, PostponedPHIs);
13036 if (E->VectorizedValue) {
13037 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13038 return E->VectorizedValue;
13044 if (
auto *
I = dyn_cast<Instruction>(V))
13047 V = FinalShuffle(V, E, VecTy);
13049 E->VectorizedValue =
V;
13050 ++NumVectorInstructions;
13054 case Instruction::Add:
13055 case Instruction::FAdd:
13056 case Instruction::Sub:
13057 case Instruction::FSub:
13058 case Instruction::Mul:
13059 case Instruction::FMul:
13060 case Instruction::UDiv:
13061 case Instruction::SDiv:
13062 case Instruction::FDiv:
13063 case Instruction::URem:
13064 case Instruction::SRem:
13065 case Instruction::FRem:
13066 case Instruction::Shl:
13067 case Instruction::LShr:
13068 case Instruction::AShr:
13069 case Instruction::And:
13070 case Instruction::Or:
13071 case Instruction::Xor: {
13072 setInsertPointAfterBundle(E);
13074 Value *
LHS = vectorizeOperand(E, 0, PostponedPHIs);
13075 if (E->VectorizedValue) {
13076 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13077 return E->VectorizedValue;
13079 Value *
RHS = vectorizeOperand(E, 1, PostponedPHIs);
13080 if (E->VectorizedValue) {
13081 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13082 return E->VectorizedValue;
13084 if (ShuffleOrOp == Instruction::And && It != MinBWs.
end()) {
13085 for (
unsigned I : seq<unsigned>(0, E->getNumOperands())) {
13088 auto *CI = dyn_cast<ConstantInt>(
Op);
13089 return CI && CI->getValue().countr_one() >= It->second.first;
13091 V = FinalShuffle(
I == 0 ? RHS : LHS, E, VecTy);
13092 E->VectorizedValue =
V;
13093 ++NumVectorInstructions;
13100 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
13101 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
13102 MinBWs.
contains(getOperandEntry(E, 0)) ||
13103 MinBWs.
contains(getOperandEntry(E, 1))) &&
13104 "Expected item in MinBWs.");
13115 if (
auto *
I = dyn_cast<Instruction>(V)) {
13118 if (!MinBWs.
contains(E) && ShuffleOrOp == Instruction::Sub &&
13120 return isCommutative(cast<Instruction>(V));
13122 I->setHasNoUnsignedWrap(
false);
13125 V = FinalShuffle(V, E, VecTy);
13127 E->VectorizedValue =
V;
13128 ++NumVectorInstructions;
13132 case Instruction::Load: {
13135 setInsertPointAfterBundle(E);
13137 LoadInst *LI = cast<LoadInst>(VL0);
13140 if (E->State == TreeEntry::Vectorize) {
13142 }
else if (E->State == TreeEntry::StridedVectorize) {
13143 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
13144 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
13145 PO = IsReverseOrder ? PtrN : Ptr0;
13151 int Stride = *Diff / (
static_cast<int>(E->Scalars.size()) - 1);
13153 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
13154 DL->getTypeAllocSize(ScalarTy));
13158 return cast<LoadInst>(V)->getPointerOperand();
13161 std::optional<Value *> Stride =
13170 (IsReverseOrder ? -1 : 1) *
13171 static_cast<int>(
DL->getTypeAllocSize(ScalarTy))));
13173 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13175 Intrinsic::experimental_vp_strided_load,
13176 {VecTy, PO->
getType(), StrideTy},
13178 Builder.
getInt32(E->Scalars.size())});
13184 assert(E->State == TreeEntry::ScatterVectorize &&
"Unhandled state");
13185 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
13186 if (E->VectorizedValue) {
13187 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13188 return E->VectorizedValue;
13191 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13196 V = FinalShuffle(V, E, VecTy);
13197 E->VectorizedValue =
V;
13198 ++NumVectorInstructions;
13201 case Instruction::Store: {
13202 auto *
SI = cast<StoreInst>(VL0);
13204 setInsertPointAfterBundle(E);
13206 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
13207 if (VecValue->
getType() != VecTy)
13209 Builder.
CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
13210 VecValue = FinalShuffle(VecValue, E, VecTy);
13214 if (E->State == TreeEntry::Vectorize) {
13217 assert(E->State == TreeEntry::StridedVectorize &&
13218 "Expected either strided or conseutive stores.");
13219 if (!E->ReorderIndices.empty()) {
13220 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
13221 Ptr =
SI->getPointerOperand();
13223 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
13224 Type *StrideTy =
DL->getIndexType(
SI->getPointerOperandType());
13226 Intrinsic::experimental_vp_strided_store,
13227 {VecTy,
Ptr->getType(), StrideTy},
13230 StrideTy, -
static_cast<int>(
DL->getTypeAllocSize(ScalarTy))),
13232 Builder.
getInt32(E->Scalars.size())});
13241 E->VectorizedValue =
V;
13242 ++NumVectorInstructions;
13245 case Instruction::GetElementPtr: {
13246 auto *GEP0 = cast<GetElementPtrInst>(VL0);
13247 setInsertPointAfterBundle(E);
13249 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
13250 if (E->VectorizedValue) {
13251 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13252 return E->VectorizedValue;
13256 for (
int J = 1,
N = GEP0->getNumOperands(); J <
N; ++J) {
13257 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
13258 if (E->VectorizedValue) {
13259 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13260 return E->VectorizedValue;
13265 Value *
V = Builder.
CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
13266 if (
Instruction *
I = dyn_cast<GetElementPtrInst>(V)) {
13268 for (
Value *V : E->Scalars) {
13269 if (isa<GetElementPtrInst>(V))
13275 V = FinalShuffle(V, E, VecTy);
13277 E->VectorizedValue =
V;
13278 ++NumVectorInstructions;
13282 case Instruction::Call: {
13283 CallInst *CI = cast<CallInst>(VL0);
13284 setInsertPointAfterBundle(E);
13290 It != MinBWs.
end() ? It->second.first : 0);
13293 VecCallCosts.first <= VecCallCosts.second;
13295 Value *ScalarArg =
nullptr;
13301 auto *CEI = cast<CallInst>(VL0);
13302 for (
unsigned I : seq<unsigned>(0, CI->
arg_size())) {
13307 ScalarArg = CEI->getArgOperand(
I);
13310 if (
ID == Intrinsic::abs && It != MinBWs.
end() &&
13311 It->second.first <
DL->getTypeSizeInBits(CEI->getType()))
13319 Value *OpVec = vectorizeOperand(E,
I, PostponedPHIs);
13320 if (E->VectorizedValue) {
13321 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13322 return E->VectorizedValue;
13324 ScalarArg = CEI->getArgOperand(
I);
13325 if (cast<VectorType>(OpVec->
getType())->getElementType() !=
13327 It == MinBWs.
end()) {
13330 OpVec = Builder.
CreateIntCast(OpVec, CastTy, GetOperandSignedness(
I));
13331 }
else if (It != MinBWs.
end()) {
13332 OpVec = Builder.
CreateIntCast(OpVec, VecTy, GetOperandSignedness(
I));
13341 if (!UseIntrinsic) {
13357 V = FinalShuffle(V, E, VecTy);
13359 E->VectorizedValue =
V;
13360 ++NumVectorInstructions;
13363 case Instruction::ShuffleVector: {
13364 assert(E->isAltShuffle() &&
13369 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
13370 "Invalid Shuffle Vector Operand");
13374 setInsertPointAfterBundle(E);
13375 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13376 if (E->VectorizedValue) {
13377 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13378 return E->VectorizedValue;
13380 RHS = vectorizeOperand(E, 1, PostponedPHIs);
13382 setInsertPointAfterBundle(E);
13383 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13385 if (E->VectorizedValue) {
13386 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
13387 return E->VectorizedValue;
13394 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
13395 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
13396 MinBWs.
contains(getOperandEntry(E, 0)) ||
13397 MinBWs.
contains(getOperandEntry(E, 1))) &&
13398 "Expected item in MinBWs.");
13399 Type *CastTy = VecTy;
13403 ->getIntegerBitWidth() < cast<VectorType>(
RHS->
getType())
13405 ->getIntegerBitWidth())
13422 }
else if (
auto *CI0 = dyn_cast<CmpInst>(VL0)) {
13423 V0 = Builder.
CreateCmp(CI0->getPredicate(), LHS, RHS);
13424 auto *AltCI = cast<CmpInst>(E->getAltOp());
13426 V1 = Builder.
CreateCmp(AltPred, LHS, RHS);
13429 unsigned SrcBWSz =
DL->getTypeSizeInBits(
13430 cast<VectorType>(
LHS->
getType())->getElementType());
13431 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
13432 if (BWSz <= SrcBWSz) {
13433 if (BWSz < SrcBWSz)
13436 if (
auto *
I = dyn_cast<Instruction>(LHS))
13438 E->VectorizedValue =
LHS;
13439 ++NumVectorInstructions;
13450 for (
Value *V : {V0, V1}) {
13451 if (
auto *
I = dyn_cast<Instruction>(V)) {
13452 GatherShuffleExtractSeq.
insert(
I);
13453 CSEBlocks.
insert(
I->getParent());
13462 E->buildAltOpShuffleMask(
13464 assert(E->isOpcodeOrAlt(
I) &&
"Unexpected main/alternate opcode");
13468 Mask, &OpScalars, &AltScalars);
13472 auto DropNuwFlag = [&](
Value *Vec,
unsigned Opcode) {
13474 if (
auto *
I = dyn_cast<Instruction>(Vec);
13475 I && Opcode == Instruction::Sub && !MinBWs.
contains(E) &&
13477 auto *IV = cast<Instruction>(V);
13478 return IV->getOpcode() == Instruction::Sub &&
13479 isCommutative(cast<Instruction>(IV));
13481 I->setHasNoUnsignedWrap(
false);
13483 DropNuwFlag(V0, E->getOpcode());
13484 DropNuwFlag(V1, E->getAltOpcode());
13487 if (
auto *
I = dyn_cast<Instruction>(V)) {
13489 GatherShuffleExtractSeq.
insert(
I);
13490 CSEBlocks.
insert(
I->getParent());
13493 E->VectorizedValue =
V;
13494 ++NumVectorInstructions;
13507 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13513struct ShuffledInsertData {
13526 for (
auto &BSIter : BlocksSchedules) {
13527 scheduleBlock(BSIter.second.get());
13531 EntryToLastInstruction.
clear();
13541 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13542 if (TE->State == TreeEntry::Vectorize &&
13543 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13544 TE->VectorizedValue)
13550 for (
const TreeEntry *E : PostponedNodes) {
13551 auto *TE =
const_cast<TreeEntry *
>(E);
13552 if (
auto *VecTE = getTreeEntry(TE->Scalars.front()))
13553 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
13554 TE->UserTreeIndices.front().EdgeIdx)) &&
13555 VecTE->isSame(TE->Scalars))
13559 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
13560 TE->VectorizedValue =
nullptr;
13562 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
13571 if (isa<PHINode>(UserI)) {
13574 for (
User *U : PrevVec->users()) {
13577 auto *UI = dyn_cast<Instruction>(U);
13578 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->
getParent())
13580 if (UI->comesBefore(InsertPt))
13589 if (Vec->
getType() != PrevVec->getType()) {
13591 PrevVec->getType()->isIntOrIntVectorTy() &&
13592 "Expected integer vector types only.");
13593 std::optional<bool> IsSigned;
13594 for (
Value *V : TE->Scalars) {
13595 if (
const TreeEntry *BaseTE = getTreeEntry(V)) {
13596 auto It = MinBWs.
find(BaseTE);
13597 if (It != MinBWs.
end()) {
13598 IsSigned = IsSigned.value_or(
false) || It->second.second;
13602 for (
const TreeEntry *MNTE : MultiNodeScalars.
lookup(V)) {
13603 auto It = MinBWs.
find(MNTE);
13604 if (It != MinBWs.
end()) {
13605 IsSigned = IsSigned.value_or(
false) || It->second.second;
13610 if (IsSigned.value_or(
false))
13613 for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
13614 auto It = MinBWs.
find(BVE);
13615 if (It != MinBWs.
end()) {
13616 IsSigned = IsSigned.value_or(
false) || It->second.second;
13621 if (IsSigned.value_or(
false))
13623 if (
auto *EE = dyn_cast<ExtractElementInst>(V)) {
13625 IsSigned.value_or(
false) ||
13629 if (IsSigned.value_or(
false))
13633 if (IsSigned.value_or(
false)) {
13635 auto It = MinBWs.
find(TE->UserTreeIndices.front().UserTE);
13636 if (It != MinBWs.
end())
13637 IsSigned = It->second.second;
13640 "Expected user node or perfect diamond match in MinBWs.");
13644 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
13647 auto It = PostponedValues.
find(PrevVec);
13648 if (It != PostponedValues.
end()) {
13649 for (TreeEntry *VTE : It->getSecond())
13650 VTE->VectorizedValue = Vec;
13670 for (
const auto &ExternalUse : ExternalUses) {
13671 Value *Scalar = ExternalUse.Scalar;
13678 TreeEntry *E = getTreeEntry(Scalar);
13679 assert(E &&
"Invalid scalar");
13680 assert(E->State != TreeEntry::NeedToGather &&
13681 "Extracting from a gather list");
13683 if (E->getOpcode() == Instruction::GetElementPtr &&
13684 !isa<GetElementPtrInst>(Scalar))
13687 Value *Vec = E->VectorizedValue;
13688 assert(Vec &&
"Can't find vectorizable value");
13691 auto ExtractAndExtendIfNeeded = [&](
Value *Vec) {
13692 if (Scalar->getType() != Vec->
getType()) {
13693 Value *Ex =
nullptr;
13694 Value *ExV =
nullptr;
13695 auto *
GEP = dyn_cast<GetElementPtrInst>(Scalar);
13697 auto It = ScalarToEEs.find(Scalar);
13698 if (It != ScalarToEEs.end()) {
13702 if (EEIt != It->second.end()) {
13708 if (
auto *CI = EEIt->second.second)
13712 ExV = EEIt->second.second ? EEIt->second.second : Ex;
13717 if (
auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
13718 Value *V = ES->getVectorOperand();
13719 if (
const TreeEntry *ETE = getTreeEntry(V))
13720 V = ETE->VectorizedValue;
13722 }
else if (ReplaceGEP) {
13725 auto *CloneGEP =
GEP->clone();
13726 if (isa<Instruction>(Vec))
13730 CloneGEP->insertBefore(
GEP);
13731 if (
GEP->hasName())
13732 CloneGEP->takeName(
GEP);
13740 if (Scalar->getType() != Ex->
getType())
13742 MinBWs.
find(E)->second.second);
13743 if (
auto *
I = dyn_cast<Instruction>(Ex))
13744 ScalarToEEs[Scalar].try_emplace(
13746 std::make_pair(
I, cast<Instruction>(ExV)));
13750 if (
auto *ExI = dyn_cast<Instruction>(Ex)) {
13751 GatherShuffleExtractSeq.
insert(ExI);
13752 CSEBlocks.
insert(ExI->getParent());
13756 assert(isa<FixedVectorType>(Scalar->getType()) &&
13757 isa<InsertElementInst>(Scalar) &&
13758 "In-tree scalar of vector type is not insertelement?");
13759 auto *IE = cast<InsertElementInst>(Scalar);
13767 if (!ScalarsWithNullptrUser.
insert(Scalar).second)
13772 if (ExternalUsesAsGEPs.contains(U))
13774 TreeEntry *UseEntry = getTreeEntry(U);
13776 (UseEntry->State == TreeEntry::Vectorize ||
13778 TreeEntry::StridedVectorize) &&
13779 (E->State == TreeEntry::Vectorize ||
13780 E->State == TreeEntry::StridedVectorize) &&
13781 doesInTreeUserNeedToExtract(
13783 cast<Instruction>(UseEntry->Scalars.front()),
13786 "Scalar with nullptr User must be registered in "
13787 "ExternallyUsedValues map or remain as scalar in vectorized "
13789 if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
13790 if (
auto *
PHI = dyn_cast<PHINode>(VecI))
13792 PHI->getParent()->getFirstNonPHIIt());
13795 std::next(VecI->getIterator()));
13799 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13801 Scalar->replaceAllUsesWith(NewInst);
13802 ReplacedExternals.emplace_back(Scalar, NewInst);
13806 if (
auto *VU = dyn_cast<InsertElementInst>(
User);
13809 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
13810 if (
auto *FTy = dyn_cast<FixedVectorType>(
User->
getType())) {
13811 if (!UsedInserts.
insert(VU).second)
13814 auto BWIt = MinBWs.
find(E);
13816 auto *ScalarTy = FTy->getElementType();
13817 auto Key = std::make_pair(Vec, ScalarTy);
13818 auto VecIt = VectorCasts.
find(Key);
13819 if (VecIt == VectorCasts.
end()) {
13821 if (
auto *IVec = dyn_cast<PHINode>(Vec))
13823 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
13824 else if (
auto *IVec = dyn_cast<Instruction>(Vec))
13830 cast<FixedVectorType>(Vec->
getType())->getNumElements()),
13831 BWIt->second.second);
13834 Vec = VecIt->second;
13841 find_if(ShuffledInserts, [VU](
const ShuffledInsertData &
Data) {
13848 unsigned Idx = *InsertIdx;
13849 if (It == ShuffledInserts.
end()) {
13851 It = std::next(ShuffledInserts.
begin(),
13852 ShuffledInserts.
size() - 1);
13858 while (
auto *IEBase = dyn_cast<InsertElementInst>(
Base)) {
13859 if (IEBase !=
User &&
13860 (!IEBase->hasOneUse() ||
13864 if (
const TreeEntry *E = getTreeEntry(IEBase)) {
13866 IEBase = cast<InsertElementInst>(
Base);
13869 "InsertElementInstruction used already.");
13870 Mask[IEIdx] = IEIdx;
13871 Base = IEBase->getOperand(0);
13872 }
while (E == getTreeEntry(
Base));
13875 Base = cast<InsertElementInst>(
Base)->getOperand(0);
13879 auto It = VectorToInsertElement.
find(
Base);
13880 if (It != VectorToInsertElement.
end())
13887 Mask[
Idx] = ExternalUse.Lane;
13888 It->InsertElements.push_back(cast<InsertElementInst>(
User));
13897 if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
13899 for (
unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
13900 if (PH->getIncomingValue(
I) == Scalar) {
13902 PH->getIncomingBlock(
I)->getTerminator();
13903 if (isa<CatchSwitchInst>(IncomingTerminator)) {
13905 std::next(VecI->getIterator()));
13909 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13910 PH->setOperand(
I, NewInst);
13915 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13920 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13930 int VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
13931 for (
int I = 0, E = Mask.size();
I < E; ++
I) {
13933 CombinedMask1[
I] = Mask[
I];
13935 CombinedMask2[
I] = Mask[
I] - VF;
13938 cast<VectorType>(V1->
getType())->getElementType(), Builder, *
this);
13939 ShuffleBuilder.
add(V1, CombinedMask1);
13941 ShuffleBuilder.
add(V2, CombinedMask2);
13942 return ShuffleBuilder.
finalize(std::nullopt);
13946 bool ForSingleMask) {
13947 unsigned VF = Mask.size();
13948 unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
13950 if (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); })) {
13951 Vec = CreateShuffle(Vec,
nullptr, Mask);
13952 return std::make_pair(Vec,
true);
13954 if (!ForSingleMask) {
13956 for (
unsigned I = 0;
I < VF; ++
I) {
13958 ResizeMask[Mask[
I]] = Mask[
I];
13960 Vec = CreateShuffle(Vec,
nullptr, ResizeMask);
13964 return std::make_pair(Vec,
false);
13968 for (
int I = 0, E = ShuffledInserts.
size();
I < E; ++
I) {
13974 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
13975 Value *NewInst = performExtractsShuffleAction<Value>(
13979 return cast<VectorType>(Vec->getType())
13980 ->getElementCount()
13981 .getKnownMinValue();
13986 assert((Vals.size() == 1 || Vals.size() == 2) &&
13987 "Expected exactly 1 or 2 input values.");
13988 if (Vals.size() == 1) {
13991 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
13992 ->getNumElements() ||
13993 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
13994 return CreateShuffle(Vals.front(), nullptr, Mask);
13995 return Vals.front();
13997 return CreateShuffle(Vals.
front() ? Vals.
front()
13999 Vals.
back(), Mask);
14001 auto It = ShuffledInserts[
I].InsertElements.
rbegin();
14004 if (It != ShuffledInserts[
I].InsertElements.
rend())
14007 while (It != ShuffledInserts[
I].InsertElements.
rend()) {
14008 assert(
II &&
"Must be an insertelement instruction.");
14013 II = dyn_cast<InsertElementInst>(
II->getOperand(0));
14016 II->replaceUsesOfWith(
II->getOperand(0), NewInst);
14017 if (
auto *NewI = dyn_cast<Instruction>(NewInst))
14018 if (
II->getParent() == NewI->getParent() &&
II->comesBefore(NewI))
14019 II->moveAfter(NewI);
14022 LastInsert->replaceAllUsesWith(NewInst);
14024 IE->replaceUsesOfWith(IE->getOperand(0),
14026 IE->replaceUsesOfWith(IE->getOperand(1),
14030 CSEBlocks.
insert(LastInsert->getParent());
14035 for (
auto &TEPtr : VectorizableTree) {
14036 TreeEntry *Entry = TEPtr.get();
14039 if (Entry->State == TreeEntry::NeedToGather)
14042 assert(Entry->VectorizedValue &&
"Can't find vectorizable value");
14045 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
14046 Value *Scalar = Entry->Scalars[Lane];
14048 if (Entry->getOpcode() == Instruction::GetElementPtr &&
14049 !isa<GetElementPtrInst>(Scalar))
14052 Type *Ty = Scalar->getType();
14054 for (
User *U : Scalar->users()) {
14058 assert((getTreeEntry(U) ||
14059 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14060 (isa_and_nonnull<Instruction>(U) &&
14061 isDeleted(cast<Instruction>(U)))) &&
14062 "Deleting out-of-tree value");
14066 LLVM_DEBUG(
dbgs() <<
"SLP: \tErasing scalar:" << *Scalar <<
".\n");
14071 RemovedInsts.
push_back(cast<Instruction>(Scalar));
14077 if (
auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
14078 V->mergeDIAssignID(RemovedInsts);
14081 InstrElementSize.
clear();
14083 const TreeEntry &RootTE = *VectorizableTree.front().get();
14084 Value *Vec = RootTE.VectorizedValue;
14085 if (
auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
14086 It != MinBWs.end() &&
14087 ReductionBitWidth != It->second.first) {
14090 ReductionRoot->getIterator());
14094 cast<VectorType>(Vec->
getType())->getElementCount()),
14095 It->second.second);
14102 <<
" gather sequences instructions.\n");
14109 Loop *L = LI->getLoopFor(
I->getParent());
14114 BasicBlock *PreHeader = L->getLoopPreheader();
14122 auto *OpI = dyn_cast<Instruction>(V);
14123 return OpI && L->contains(OpI);
14129 CSEBlocks.
insert(PreHeader);
14144 assert((
A ==
B) == (
A->getDFSNumIn() ==
B->getDFSNumIn()) &&
14145 "Different nodes should have different DFS numbers");
14146 return A->getDFSNumIn() <
B->getDFSNumIn();
14156 if (I1->getType() != I2->getType())
14158 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
14159 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
14161 return I1->isIdenticalTo(I2);
14162 if (SI1->isIdenticalTo(SI2))
14164 for (
int I = 0, E = SI1->getNumOperands();
I < E; ++
I)
14165 if (SI1->getOperand(
I) != SI2->getOperand(
I))
14168 NewMask.
assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
14172 unsigned LastUndefsCnt = 0;
14173 for (
int I = 0, E = NewMask.
size();
I < E; ++
I) {
14179 NewMask[
I] != SM1[
I])
14182 NewMask[
I] = SM1[
I];
14186 return SM1.
size() - LastUndefsCnt > 1 &&
14190 SM1.
size() - LastUndefsCnt));
14196 for (
auto I = CSEWorkList.
begin(), E = CSEWorkList.
end();
I != E; ++
I) {
14199 "Worklist not sorted properly!");
14205 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
14206 !GatherShuffleExtractSeq.contains(&In))
14211 bool Replaced =
false;
14214 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
14215 DT->
dominates(V->getParent(), In.getParent())) {
14216 In.replaceAllUsesWith(V);
14218 if (
auto *SI = dyn_cast<ShuffleVectorInst>(V))
14219 if (!NewMask.
empty())
14220 SI->setShuffleMask(NewMask);
14224 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
14225 GatherShuffleExtractSeq.contains(V) &&
14226 IsIdenticalOrLessDefined(V, &In, NewMask) &&
14227 DT->
dominates(In.getParent(), V->getParent())) {
14229 V->replaceAllUsesWith(&In);
14231 if (
auto *SI = dyn_cast<ShuffleVectorInst>(&In))
14232 if (!NewMask.
empty())
14233 SI->setShuffleMask(NewMask);
14241 Visited.push_back(&In);
14246 GatherShuffleExtractSeq.clear();
14249BoUpSLP::ScheduleData *
14251 ScheduleData *Bundle =
nullptr;
14252 ScheduleData *PrevInBundle =
nullptr;
14253 for (
Value *V : VL) {
14256 ScheduleData *BundleMember = getScheduleData(V);
14258 "no ScheduleData for bundle member "
14259 "(maybe not in same basic block)");
14260 assert(BundleMember->isSchedulingEntity() &&
14261 "bundle member already part of other bundle");
14262 if (PrevInBundle) {
14263 PrevInBundle->NextInBundle = BundleMember;
14265 Bundle = BundleMember;
14269 BundleMember->FirstInBundle = Bundle;
14270 PrevInBundle = BundleMember;
14272 assert(Bundle &&
"Failed to find schedule bundle");
14278std::optional<BoUpSLP::ScheduleData *>
14280 const InstructionsState &S) {
14291 auto TryScheduleBundleImpl = [
this, OldScheduleEnd, SLP](
bool ReSchedule,
14292 ScheduleData *Bundle) {
14298 if (ScheduleEnd != OldScheduleEnd) {
14299 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode())
14300 doForAllOpcodes(
I, [](ScheduleData *SD) { SD->clearDependencies(); });
14305 <<
" in block " << BB->
getName() <<
"\n");
14306 calculateDependencies(Bundle,
true, SLP);
14311 initialFillReadyList(ReadyInsts);
14318 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
14319 !ReadyInsts.empty()) {
14320 ScheduleData *Picked = ReadyInsts.pop_back_val();
14321 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
14322 "must be ready to schedule");
14323 schedule(Picked, ReadyInsts);
14329 for (
Value *V : VL) {
14332 if (!extendSchedulingRegion(V, S)) {
14339 TryScheduleBundleImpl(
false,
nullptr);
14340 return std::nullopt;
14344 bool ReSchedule =
false;
14345 for (
Value *V : VL) {
14348 ScheduleData *BundleMember = getScheduleData(V);
14350 "no ScheduleData for bundle member (maybe not in same basic block)");
14354 ReadyInsts.remove(BundleMember);
14356 if (!BundleMember->IsScheduled)
14361 LLVM_DEBUG(
dbgs() <<
"SLP: reset schedule because " << *BundleMember
14362 <<
" was already scheduled\n");
14366 auto *Bundle = buildBundle(VL);
14367 TryScheduleBundleImpl(ReSchedule, Bundle);
14368 if (!Bundle->isReady()) {
14369 cancelScheduling(VL, S.OpValue);
14370 return std::nullopt;
14383 ScheduleData *Bundle = getScheduleData(OpValue);
14384 LLVM_DEBUG(
dbgs() <<
"SLP: cancel scheduling of " << *Bundle <<
"\n");
14385 assert(!Bundle->IsScheduled &&
14386 "Can't cancel bundle which is already scheduled");
14387 assert(Bundle->isSchedulingEntity() &&
14389 "tried to unbundle something which is not a bundle");
14392 if (Bundle->isReady())
14393 ReadyInsts.remove(Bundle);
14396 ScheduleData *BundleMember = Bundle;
14397 while (BundleMember) {
14398 assert(BundleMember->FirstInBundle == Bundle &&
"corrupt bundle links");
14399 BundleMember->FirstInBundle = BundleMember;
14400 ScheduleData *Next = BundleMember->NextInBundle;
14401 BundleMember->NextInBundle =
nullptr;
14402 BundleMember->TE =
nullptr;
14403 if (BundleMember->unscheduledDepsInBundle() == 0) {
14404 ReadyInsts.insert(BundleMember);
14406 BundleMember = Next;
14410BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
14412 if (ChunkPos >= ChunkSize) {
14413 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
14416 return &(ScheduleDataChunks.back()[ChunkPos++]);
14419bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
Value *V,
14420 const InstructionsState &S) {
14421 if (getScheduleData(V,
isOneOf(S, V)))
14424 assert(
I &&
"bundle member must be an instruction");
14427 "phi nodes/insertelements/extractelements/extractvalues don't need to "
14429 auto &&CheckScheduleForI = [
this, &S](
Instruction *
I) ->
bool {
14430 ScheduleData *ISD = getScheduleData(
I);
14433 assert(isInSchedulingRegion(ISD) &&
14434 "ScheduleData not in scheduling region");
14435 ScheduleData *SD = allocateScheduleDataChunks();
14437 SD->init(SchedulingRegionID, S.OpValue);
14438 ExtraScheduleDataMap[
I][S.OpValue] = SD;
14441 if (CheckScheduleForI(
I))
14443 if (!ScheduleStart) {
14445 initScheduleData(
I,
I->getNextNode(),
nullptr,
nullptr);
14447 ScheduleEnd =
I->getNextNode();
14449 CheckScheduleForI(
I);
14450 assert(ScheduleEnd &&
"tried to vectorize a terminator?");
14451 LLVM_DEBUG(
dbgs() <<
"SLP: initialize schedule region to " << *
I <<
"\n");
14459 ++ScheduleStart->getIterator().getReverse();
14464 if (
auto *
II = dyn_cast<IntrinsicInst>(&
I))
14465 return II->isAssumeLikeIntrinsic();
14468 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14469 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14470 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter !=
I &&
14472 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
14473 LLVM_DEBUG(
dbgs() <<
"SLP: exceeded schedule region size limit\n");
14480 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14481 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14483 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter ==
I)) {
14484 assert(
I->getParent() == ScheduleStart->getParent() &&
14485 "Instruction is in wrong basic block.");
14486 initScheduleData(
I, ScheduleStart,
nullptr, FirstLoadStoreInRegion);
14489 CheckScheduleForI(
I);
14494 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter ==
I)) &&
14495 "Expected to reach top of the basic block or instruction down the "
14497 assert(
I->getParent() == ScheduleEnd->getParent() &&
14498 "Instruction is in wrong basic block.");
14499 initScheduleData(ScheduleEnd,
I->getNextNode(), LastLoadStoreInRegion,
14501 ScheduleEnd =
I->getNextNode();
14503 CheckScheduleForI(
I);
14504 assert(ScheduleEnd &&
"tried to vectorize a terminator?");
14505 LLVM_DEBUG(
dbgs() <<
"SLP: extend schedule region end to " << *
I <<
"\n");
14509void BoUpSLP::BlockScheduling::initScheduleData(
Instruction *FromI,
14511 ScheduleData *PrevLoadStore,
14512 ScheduleData *NextLoadStore) {
14513 ScheduleData *CurrentLoadStore = PrevLoadStore;
14518 ScheduleData *SD = ScheduleDataMap.lookup(
I);
14520 SD = allocateScheduleDataChunks();
14521 ScheduleDataMap[
I] = SD;
14524 assert(!isInSchedulingRegion(SD) &&
14525 "new ScheduleData already in scheduling region");
14526 SD->init(SchedulingRegionID,
I);
14528 if (
I->mayReadOrWriteMemory() &&
14529 (!isa<IntrinsicInst>(
I) ||
14530 (cast<IntrinsicInst>(
I)->getIntrinsicID() != Intrinsic::sideeffect &&
14531 cast<IntrinsicInst>(
I)->getIntrinsicID() !=
14532 Intrinsic::pseudoprobe))) {
14534 if (CurrentLoadStore) {
14535 CurrentLoadStore->NextLoadStore = SD;
14537 FirstLoadStoreInRegion = SD;
14539 CurrentLoadStore = SD;
14542 if (
match(
I, m_Intrinsic<Intrinsic::stacksave>()) ||
14543 match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
14544 RegionHasStackSave =
true;
14546 if (NextLoadStore) {
14547 if (CurrentLoadStore)
14548 CurrentLoadStore->NextLoadStore = NextLoadStore;
14550 LastLoadStoreInRegion = CurrentLoadStore;
14554void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14555 bool InsertInReadyList,
14557 assert(SD->isSchedulingEntity());
14562 while (!WorkList.
empty()) {
14564 for (ScheduleData *BundleMember = SD; BundleMember;
14565 BundleMember = BundleMember->NextInBundle) {
14566 assert(isInSchedulingRegion(BundleMember));
14567 if (BundleMember->hasValidDependencies())
14572 BundleMember->Dependencies = 0;
14573 BundleMember->resetUnscheduledDeps();
14576 if (BundleMember->OpValue != BundleMember->Inst) {
14577 if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
14578 BundleMember->Dependencies++;
14579 ScheduleData *DestBundle = UseSD->FirstInBundle;
14580 if (!DestBundle->IsScheduled)
14581 BundleMember->incrementUnscheduledDeps(1);
14582 if (!DestBundle->hasValidDependencies())
14586 for (
User *U : BundleMember->Inst->
users()) {
14587 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
14588 BundleMember->Dependencies++;
14589 ScheduleData *DestBundle = UseSD->FirstInBundle;
14590 if (!DestBundle->IsScheduled)
14591 BundleMember->incrementUnscheduledDeps(1);
14592 if (!DestBundle->hasValidDependencies())
14599 auto *DepDest = getScheduleData(
I);
14600 assert(DepDest &&
"must be in schedule window");
14601 DepDest->ControlDependencies.push_back(BundleMember);
14602 BundleMember->Dependencies++;
14603 ScheduleData *DestBundle = DepDest->FirstInBundle;
14604 if (!DestBundle->IsScheduled)
14605 BundleMember->incrementUnscheduledDeps(1);
14606 if (!DestBundle->hasValidDependencies())
14614 for (
Instruction *
I = BundleMember->Inst->getNextNode();
14615 I != ScheduleEnd;
I =
I->getNextNode()) {
14620 MakeControlDependent(
I);
14628 if (RegionHasStackSave) {
14632 if (
match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
14633 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
14634 for (
Instruction *
I = BundleMember->Inst->getNextNode();
14635 I != ScheduleEnd;
I =
I->getNextNode()) {
14636 if (
match(
I, m_Intrinsic<Intrinsic::stacksave>()) ||
14637 match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
14642 if (!isa<AllocaInst>(
I))
14646 MakeControlDependent(
I);
14655 if (isa<AllocaInst>(BundleMember->Inst) ||
14656 BundleMember->Inst->mayReadOrWriteMemory()) {
14657 for (
Instruction *
I = BundleMember->Inst->getNextNode();
14658 I != ScheduleEnd;
I =
I->getNextNode()) {
14659 if (!
match(
I, m_Intrinsic<Intrinsic::stacksave>()) &&
14660 !
match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
14664 MakeControlDependent(
I);
14671 ScheduleData *DepDest = BundleMember->NextLoadStore;
14676 "NextLoadStore list for non memory effecting bundle?");
14678 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14679 unsigned NumAliased = 0;
14680 unsigned DistToSrc = 1;
14682 for (; DepDest; DepDest = DepDest->NextLoadStore) {
14683 assert(isInSchedulingRegion(DepDest));
14693 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
14695 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
14702 DepDest->MemoryDependencies.push_back(BundleMember);
14703 BundleMember->Dependencies++;
14704 ScheduleData *DestBundle = DepDest->FirstInBundle;
14705 if (!DestBundle->IsScheduled) {
14706 BundleMember->incrementUnscheduledDeps(1);
14708 if (!DestBundle->hasValidDependencies()) {
14731 if (InsertInReadyList && SD->isReady()) {
14732 ReadyInsts.insert(SD);
14739void BoUpSLP::BlockScheduling::resetSchedule() {
14741 "tried to reset schedule on block which has not been scheduled");
14742 for (
Instruction *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
14743 doForAllOpcodes(
I, [&](ScheduleData *SD) {
14744 assert(isInSchedulingRegion(SD) &&
14745 "ScheduleData not in scheduling region");
14746 SD->IsScheduled =
false;
14747 SD->resetUnscheduledDeps();
14750 ReadyInsts.clear();
14753void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
14754 if (!BS->ScheduleStart)
14757 LLVM_DEBUG(
dbgs() <<
"SLP: schedule block " << BS->BB->getName() <<
"\n");
14764 BS->resetSchedule();
14771 struct ScheduleDataCompare {
14772 bool operator()(ScheduleData *SD1, ScheduleData *SD2)
const {
14773 return SD2->SchedulingPriority < SD1->SchedulingPriority;
14776 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
14781 for (
auto *
I = BS->ScheduleStart;
I != BS->ScheduleEnd;
14782 I =
I->getNextNode()) {
14783 BS->doForAllOpcodes(
I, [
this, &
Idx, BS](ScheduleData *SD) {
14784 TreeEntry *SDTE = getTreeEntry(SD->Inst);
14787 SD->isPartOfBundle() ==
14789 "scheduler and vectorizer bundle mismatch");
14790 SD->FirstInBundle->SchedulingPriority =
Idx++;
14792 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
14793 BS->calculateDependencies(SD,
false,
this);
14796 BS->initialFillReadyList(ReadyInsts);
14798 Instruction *LastScheduledInst = BS->ScheduleEnd;
14801 while (!ReadyInsts.empty()) {
14802 ScheduleData *Picked = *ReadyInsts.begin();
14803 ReadyInsts.erase(ReadyInsts.begin());
14807 for (ScheduleData *BundleMember = Picked; BundleMember;
14808 BundleMember = BundleMember->NextInBundle) {
14812 LastScheduledInst = PickedInst;
14815 BS->schedule(Picked, ReadyInsts);
14819#ifdef EXPENSIVE_CHECKS
14823#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
14825 for (
auto *
I = BS->ScheduleStart;
I != BS->ScheduleEnd;
I =
I->getNextNode()) {
14826 BS->doForAllOpcodes(
I, [&](ScheduleData *SD) {
14827 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
14828 assert(SD->IsScheduled &&
"must be scheduled at this point");
14835 BS->ScheduleStart =
nullptr;
14842 if (
auto *Store = dyn_cast<StoreInst>(V))
14843 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
14845 if (
auto *IEI = dyn_cast<InsertElementInst>(V))
14848 auto E = InstrElementSize.
find(V);
14849 if (E != InstrElementSize.
end())
14858 if (
auto *
I = dyn_cast<Instruction>(V)) {
14866 Value *FirstNonBool =
nullptr;
14867 while (!Worklist.
empty()) {
14872 auto *Ty =
I->getType();
14873 if (isa<VectorType>(Ty))
14875 if (Ty != Builder.
getInt1Ty() && !FirstNonBool)
14882 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(
I))
14883 Width = std::max<unsigned>(Width,
DL->getTypeSizeInBits(Ty));
14891 for (
Use &U :
I->operands()) {
14892 if (
auto *J = dyn_cast<Instruction>(U.get()))
14893 if (Visited.
insert(J).second &&
14894 (isa<PHINode>(
I) || J->getParent() == Parent)) {
14898 if (!FirstNonBool && U.get()->getType() != Builder.
getInt1Ty())
14899 FirstNonBool = U.get();
14910 if (V->getType() == Builder.
getInt1Ty() && FirstNonBool)
14912 Width =
DL->getTypeSizeInBits(V->getType());
14916 InstrElementSize[
I] = Width;
14921bool BoUpSLP::collectValuesToDemote(
14922 const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
14924 unsigned &MaxDepthLevel,
bool &IsProfitableToDemote,
14925 bool IsTruncRoot)
const {
14927 if (
all_of(E.Scalars, IsaPred<Constant>))
14930 unsigned OrigBitWidth =
DL->getTypeSizeInBits(E.Scalars.front()->getType());
14939 bool IsSignedNode =
any_of(E.Scalars, [&](
Value *R) {
14940 return !isKnownNonNegative(R, SimplifyQuery(*DL));
14942 auto IsPotentiallyTruncated = [&](
Value *V,
unsigned &
BitWidth) ->
bool {
14949 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth >
BitWidth) {
14955 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
14958 if (
auto *
I = dyn_cast<Instruction>(V)) {
14960 unsigned BitWidth2 =
14961 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
14962 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
14968 BitWidth1 = std::min(BitWidth1, BitWidth2);
14973 using namespace std::placeholders;
14974 auto FinalAnalysis = [&]() {
14975 if (!IsProfitableToDemote)
14978 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(
BitWidth)));
14980 if (Res && E.State == TreeEntry::NeedToGather) {
14984 for (
Value *V : E.Scalars) {
14985 auto *EE = dyn_cast<ExtractElementInst>(V);
14988 UniqueBases.
insert(EE->getVectorOperand());
14990 const unsigned VF = E.Scalars.size();
14991 Type *OrigScalarTy = E.Scalars.front()->getType();
14992 if (UniqueBases.
size() <= 2 ||
15000 if (E.State == TreeEntry::NeedToGather || !Visited.
insert(&E).second ||
15002 return all_of(V->users(), [&](User *U) {
15003 return isa<InsertElementInst>(U) && !getTreeEntry(U);
15006 return FinalAnalysis();
15009 return !all_of(V->users(), [=](User *U) {
15010 return getTreeEntry(U) ||
15011 (UserIgnoreList && UserIgnoreList->contains(U)) ||
15012 (!isa<CmpInst>(U) && U->getType()->isSized() &&
15013 !U->getType()->isScalableTy() &&
15014 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
15015 }) && !IsPotentiallyTruncated(V,
BitWidth);
15020 bool &NeedToExit) {
15021 NeedToExit =
false;
15022 unsigned InitLevel = MaxDepthLevel;
15024 unsigned Level = InitLevel;
15025 if (!collectValuesToDemote(*
Op, IsProfitableToDemoteRoot,
BitWidth,
15026 ToDemote, Visited, Level, IsProfitableToDemote,
15028 if (!IsProfitableToDemote)
15031 if (!FinalAnalysis())
15035 MaxDepthLevel = std::max(MaxDepthLevel, Level);
15039 auto AttemptCheckBitwidth =
15042 NeedToExit =
false;
15043 unsigned BestFailBitwidth = 0;
15045 if (Checker(
BitWidth, OrigBitWidth))
15047 if (BestFailBitwidth == 0 && FinalAnalysis())
15051 if (BestFailBitwidth == 0) {
15062 auto TryProcessInstruction =
15069 (void)
for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15074 if (E.UserTreeIndices.size() > 1 &&
15075 !
all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15078 bool NeedToExit =
false;
15079 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
15083 if (!ProcessOperands(
Operands, NeedToExit))
15092 return IsProfitableToDemote;
15094 switch (E.getOpcode()) {
15098 case Instruction::Trunc:
15099 if (IsProfitableToDemoteRoot)
15100 IsProfitableToDemote =
true;
15101 return TryProcessInstruction(
BitWidth);
15102 case Instruction::ZExt:
15103 case Instruction::SExt:
15104 IsProfitableToDemote =
true;
15105 return TryProcessInstruction(
BitWidth);
15109 case Instruction::Add:
15110 case Instruction::Sub:
15111 case Instruction::Mul:
15112 case Instruction::And:
15113 case Instruction::Or:
15114 case Instruction::Xor: {
15115 return TryProcessInstruction(
15116 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
15118 case Instruction::Shl: {
15123 auto *I = cast<Instruction>(V);
15124 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15125 return AmtKnownBits.getMaxValue().ult(BitWidth);
15128 return TryProcessInstruction(
15129 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
15131 case Instruction::LShr: {
15135 auto LShrChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
15137 auto *I = cast<Instruction>(V);
15138 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15139 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15140 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15141 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
15142 SimplifyQuery(*DL));
15145 return TryProcessInstruction(
15146 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15149 case Instruction::AShr: {
15153 auto AShrChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
15155 auto *I = cast<Instruction>(V);
15156 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15157 unsigned ShiftedBits = OrigBitWidth - BitWidth;
15158 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15159 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15163 return TryProcessInstruction(
15164 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15167 case Instruction::UDiv:
15168 case Instruction::URem: {
15170 auto Checker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
15173 auto *I = cast<Instruction>(V);
15174 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15175 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
15176 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15179 return TryProcessInstruction(
15180 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
15184 case Instruction::Select: {
15185 return TryProcessInstruction(
15186 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
15191 case Instruction::PHI: {
15192 const unsigned NumOps = E.getNumOperands();
15195 std::bind(&BoUpSLP::getOperandEntry,
this, &E, _1));
15197 return TryProcessInstruction(
BitWidth, Ops);
15200 case Instruction::Call: {
15201 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
15205 if (
ID != Intrinsic::abs &&
ID != Intrinsic::smin &&
15206 ID != Intrinsic::smax &&
ID != Intrinsic::umin &&
ID != Intrinsic::umax)
15210 auto CompChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
15213 auto *I = cast<Instruction>(V);
15214 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
15215 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15216 return MaskedValueIsZero(I->getOperand(0), Mask,
15217 SimplifyQuery(*DL)) &&
15218 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15220 assert((
ID == Intrinsic::smin ||
ID == Intrinsic::smax) &&
15221 "Expected min/max intrinsics only.");
15222 unsigned SignBits = OrigBitWidth -
BitWidth;
15228 return SignBits <= Op0SignBits &&
15229 ((SignBits != Op0SignBits &&
15233 SignBits <= Op1SignBits &&
15234 ((SignBits != Op1SignBits &&
15239 if (
ID != Intrinsic::abs) {
15240 Operands.push_back(getOperandEntry(&E, 1));
15241 CallChecker = CompChecker;
15244 std::numeric_limits<InstructionCost::CostType>::max();
15246 unsigned VF = E.Scalars.size();
15256 if (
Cost < BestCost) {
15262 [[maybe_unused]]
bool NeedToExit;
15263 (void)AttemptCheckBitwidth(Checker, NeedToExit);
15273 return FinalAnalysis();
15280 bool IsStoreOrInsertElt =
15281 VectorizableTree.front()->getOpcode() == Instruction::Store ||
15282 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
15283 if ((IsStoreOrInsertElt || UserIgnoreList) &&
15284 ExtraBitWidthNodes.
size() <= 1 &&
15285 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
15286 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
15289 unsigned NodeIdx = 0;
15290 if (IsStoreOrInsertElt &&
15291 VectorizableTree.front()->State != TreeEntry::NeedToGather)
15295 if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
15296 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.
empty()) ||
15297 (NodeIdx != 0 &&
any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15300 static_cast<int>(NodeIdx);
15306 bool IsTruncRoot =
false;
15307 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
15309 if (NodeIdx != 0 &&
15310 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15311 VectorizableTree[NodeIdx]->
getOpcode() == Instruction::Trunc) {
15312 assert(IsStoreOrInsertElt &&
"Expected store/insertelement seeded graph.");
15313 IsTruncRoot =
true;
15315 IsProfitableToDemoteRoot =
true;
15320 if (AnalyzedMinBWVals.
contains(VectorizableTree[NodeIdx]->Scalars.front()))
15324 auto ComputeMaxBitWidth = [&](
const TreeEntry &E,
bool IsTopRoot,
15325 bool IsProfitableToDemoteRoot,
unsigned Opcode,
15326 unsigned Limit,
bool IsTruncRoot,
15327 bool IsSignedCmp) {
15329 unsigned VF = E.getVectorFactor();
15330 auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
15331 if (!TreeRootIT || !Opcode)
15335 [&](
Value *V) { return AnalyzedMinBWVals.contains(V); }))
15338 unsigned NumParts =
15344 unsigned MaxBitWidth = 1u;
15352 bool IsKnownPositive = !IsSignedCmp &&
all_of(E.Scalars, [&](
Value *R) {
15353 KnownBits Known = computeKnownBits(R, *DL);
15354 return Known.isNonNegative();
15359 for (
Value *Root : E.Scalars) {
15362 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15378 if (!IsKnownPositive)
15382 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15384 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
15387 if (MaxBitWidth < 8 && MaxBitWidth > 1)
15392 if (NumParts > 1 &&
15398 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
15399 Opcode == Instruction::SExt ||
15400 Opcode == Instruction::ZExt || NumParts > 1;
15405 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
15406 bool NeedToDemote = IsProfitableToDemote;
15408 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
15409 ToDemote, Visited, MaxDepthLevel, NeedToDemote,
15411 (MaxDepthLevel <= Limit &&
15412 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
15413 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
15414 DL->getTypeSizeInBits(TreeRootIT) /
15415 DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
15421 MaxBitWidth =
bit_ceil(MaxBitWidth);
15423 return MaxBitWidth;
15430 if (UserIgnoreList &&
15431 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
15432 for (
Value *V : *UserIgnoreList) {
15434 auto NumTypeBits =
DL->getTypeSizeInBits(V->getType());
15435 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15438 unsigned BitWidth2 = BitWidth1;
15441 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15443 ReductionBitWidth =
15444 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
15446 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
15447 ReductionBitWidth = 8;
15449 ReductionBitWidth =
bit_ceil(ReductionBitWidth);
15451 bool IsTopRoot = NodeIdx == 0;
15452 while (NodeIdx < VectorizableTree.size() &&
15453 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15454 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15457 IsTruncRoot =
true;
15459 bool IsSignedCmp =
false;
15460 while (NodeIdx < VectorizableTree.size()) {
15462 unsigned Limit = 2;
15463 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
15465 ReductionBitWidth ==
15466 DL->getTypeSizeInBits(
15467 VectorizableTree.front()->Scalars.front()->getType()))
15469 unsigned MaxBitWidth = ComputeMaxBitWidth(
15470 *VectorizableTree[NodeIdx].
get(), IsTopRoot, IsProfitableToDemoteRoot,
15471 Opcode, Limit, IsTruncRoot, IsSignedCmp);
15472 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.
empty())) {
15473 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
15474 ReductionBitWidth =
bit_ceil(MaxBitWidth);
15475 else if (MaxBitWidth == 0)
15476 ReductionBitWidth = 0;
15479 for (
unsigned Idx : RootDemotes) {
15481 uint32_t OrigBitWidth =
DL->getTypeSizeInBits(V->getType());
15482 if (OrigBitWidth > MaxBitWidth) {
15490 RootDemotes.clear();
15492 IsProfitableToDemoteRoot =
true;
15494 if (ExtraBitWidthNodes.
empty()) {
15495 NodeIdx = VectorizableTree.size();
15497 unsigned NewIdx = 0;
15499 NewIdx = *ExtraBitWidthNodes.
begin();
15500 ExtraBitWidthNodes.
erase(ExtraBitWidthNodes.
begin());
15501 }
while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.
empty());
15504 NodeIdx < VectorizableTree.size() &&
15505 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15508 EI.
UserTE->getOpcode() == Instruction::Trunc &&
15509 !EI.
UserTE->isAltShuffle();
15512 NodeIdx < VectorizableTree.size() &&
15513 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15515 return EI.
UserTE->getOpcode() == Instruction::ICmp &&
15517 auto *IC = dyn_cast<ICmpInst>(V);
15520 !isKnownNonNegative(IC->getOperand(0),
15521 SimplifyQuery(*DL)) ||
15522 !isKnownNonNegative(IC->getOperand(1),
15523 SimplifyQuery(*DL)));
15530 if (MaxBitWidth == 0 ||
15532 cast<IntegerType>(TreeRoot.
front()->getType())->getBitWidth()) {
15533 if (UserIgnoreList)
15540 for (
unsigned Idx : ToDemote) {
15541 TreeEntry *TE = VectorizableTree[
Idx].get();
15544 bool IsSigned =
any_of(TE->Scalars, [&](
Value *R) {
15545 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15563 bool Changed =
runImpl(
F, SE,
TTI, TLI, AA, LI, DT, AC, DB, ORE);
15588 DL = &
F.getDataLayout();
15592 bool Changed =
false;
15598 dbgs() <<
"SLP: Didn't find any vector registers for target, abort.\n");
15603 if (
F.hasFnAttribute(Attribute::NoImplicitFloat))
15606 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing blocks in " <<
F.getName() <<
".\n");
15610 BoUpSLP R(&
F, SE,
TTI, TLI, AA, LI, DT, AC, DB,
DL, ORE_);
15619 for (
auto *BB :
post_order(&
F.getEntryBlock())) {
15621 R.clearReductionData();
15622 collectSeedInstructions(BB);
15625 if (!Stores.empty()) {
15627 <<
" underlying objects.\n");
15628 Changed |= vectorizeStoreChains(R);
15632 Changed |= vectorizeChainsInBlock(BB, R);
15637 if (!GEPs.
empty()) {
15639 <<
" underlying objects.\n");
15640 Changed |= vectorizeGEPIndices(BB, R);
15645 R.optimizeGatherSequence();
15653 unsigned Idx,
unsigned MinVF,
15658 const unsigned Sz = R.getVectorElementSize(Chain[0]);
15659 unsigned VF = Chain.
size();
15673 for (
Value *V : Chain)
15674 ValOps.
insert(cast<StoreInst>(V)->getValueOperand());
15677 if (
all_of(ValOps, IsaPred<Instruction>) && ValOps.
size() > 1) {
15682 if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
15683 (!S.MainOp->isSafeToRemove() ||
15686 return !isa<ExtractElementInst>(V) &&
15687 (V->getNumUses() > Chain.size() ||
15688 any_of(V->users(), [&](User *U) {
15689 return !Stores.contains(U);
15692 (ValOps.
size() > Chain.size() / 2 && !S.getOpcode())) {
15693 Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
15697 if (
R.isLoadCombineCandidate(Chain))
15699 R.buildTree(Chain);
15701 if (
R.isTreeTinyAndNotFullyVectorizable()) {
15702 if (
R.isGathered(Chain.front()) ||
15703 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
15704 return std::nullopt;
15705 Size =
R.getTreeSize();
15708 R.reorderTopToBottom();
15709 R.reorderBottomToTop();
15710 R.buildExternalUses();
15712 R.computeMinimumValueSizes();
15713 R.transformNodes();
15715 Size =
R.getTreeSize();
15716 if (S.getOpcode() == Instruction::Load)
15724 using namespace ore;
15727 cast<StoreInst>(Chain[0]))
15728 <<
"Stores SLP vectorized with cost " <<
NV(
"Cost",
Cost)
15729 <<
" and with tree size "
15730 <<
NV(
"TreeSize",
R.getTreeSize()));
15744 Sizes.begin(), Sizes.end(),
static_cast<uint64_t>(0),
15745 [&](
uint64_t V,
const std::pair<unsigned, unsigned> &Val) {
15746 unsigned Size = First ? Val.first : Val.second;
15758 Sizes.begin(), Sizes.end(),
static_cast<uint64_t>(0),
15759 [&](
uint64_t V,
const std::pair<unsigned, unsigned> &Val) {
15760 unsigned P = First ? Val.first : Val.second;
15763 return V + (P - Mean) * (P - Mean);
15766 return Dev * 81 / (Mean * Mean) == 0;
15769bool SLPVectorizerPass::vectorizeStores(
15771 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
15776 bool Changed =
false;
15778 struct StoreDistCompare {
15779 bool operator()(
const std::pair<unsigned, int> &Op1,
15780 const std::pair<unsigned, int> &Op2)
const {
15781 return Op1.second < Op2.second;
15786 using StoreIndexToDistSet =
15787 std::set<std::pair<unsigned, int>, StoreDistCompare>;
15788 auto TryToVectorize = [&](
const StoreIndexToDistSet &Set) {
15793 if (
Operands.empty() ||
Data.second - PrevDist == 1) {
15795 PrevDist =
Data.second;
15796 if (
Idx != Set.size() - 1)
15801 Operands.push_back(Stores[DataVar.first]);
15802 PrevDist = DataVar.second;
15807 .
insert({Operands.front(),
15808 cast<StoreInst>(Operands.front())->getValueOperand(),
15810 cast<StoreInst>(Operands.back())->getValueOperand(),
15815 unsigned MaxVecRegSize =
R.getMaxVecRegSize();
15816 unsigned EltSize =
R.getVectorElementSize(
Operands[0]);
15820 std::min(
R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
15821 unsigned MaxRegVF = MaxVF;
15823 Type *StoreTy =
Store->getValueOperand()->getType();
15824 Type *ValueTy = StoreTy;
15825 if (
auto *Trunc = dyn_cast<TruncInst>(
Store->getValueOperand()))
15826 ValueTy = Trunc->getSrcTy();
15827 if (ValueTy == StoreTy &&
15828 R.getVectorElementSize(
Store->getValueOperand()) <= EltSize)
15830 unsigned MinVF = std::max<unsigned>(
15832 R.getMinVF(
DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
15835 if (MaxVF < MinVF) {
15836 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
15838 <<
"MinVF (" << MinVF <<
")\n");
15842 unsigned NonPowerOf2VF = 0;
15847 unsigned CandVF =
Operands.size();
15849 NonPowerOf2VF = CandVF;
15854 unsigned Size = MinVF;
15856 VF =
Size > MaxVF ? NonPowerOf2VF :
Size;
15860 unsigned Repeat = 0;
15861 constexpr unsigned MaxAttempts = 4;
15863 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &
P) {
15864 P.first =
P.second = 1;
15867 auto IsNotVectorized = [](
bool First,
15868 const std::pair<unsigned, unsigned> &
P) {
15869 return First ?
P.first > 0 :
P.second > 0;
15871 auto IsVectorized = [](
bool First,
15872 const std::pair<unsigned, unsigned> &
P) {
15873 return First ?
P.first == 0 :
P.second == 0;
15875 auto VFIsProfitable = [](
bool First,
unsigned Size,
15876 const std::pair<unsigned, unsigned> &
P) {
15879 auto FirstSizeSame = [](
unsigned Size,
15880 const std::pair<unsigned, unsigned> &
P) {
15881 return Size ==
P.first;
15885 bool RepeatChanged =
false;
15886 bool AnyProfitableGraph =
false;
15887 for (
unsigned Size : CandidateVFs) {
15888 AnyProfitableGraph =
false;
15889 unsigned StartIdx = std::distance(
15890 RangeSizes.begin(),
15891 find_if(RangeSizes, std::bind(IsNotVectorized,
Size >= MaxRegVF,
15892 std::placeholders::_1)));
15893 while (StartIdx <
End) {
15895 std::distance(RangeSizes.begin(),
15896 find_if(RangeSizes.drop_front(StartIdx),
15897 std::bind(IsVectorized,
Size >= MaxRegVF,
15898 std::placeholders::_1)));
15899 unsigned Sz = EndIdx >=
End ?
End : EndIdx;
15900 for (
unsigned Cnt = StartIdx; Cnt +
Size <= Sz;) {
15902 Size >= MaxRegVF)) {
15909 return cast<StoreInst>(V)
15910 ->getValueOperand()
15912 cast<StoreInst>(Slice.
front())
15913 ->getValueOperand()
15916 "Expected all operands of same type.");
15917 if (!NonSchedulable.empty()) {
15918 auto [NonSchedSizeMax, NonSchedSizeMin] =
15919 NonSchedulable.lookup(Slice.
front());
15920 if (NonSchedSizeMax > 0 && NonSchedSizeMin <=
Size) {
15921 Cnt += NonSchedSizeMax;
15926 std::optional<bool> Res =
15927 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
15931 .first->getSecond()
15939 AnyProfitableGraph = RepeatChanged = Changed =
true;
15943 [](std::pair<unsigned, unsigned> &
P) {
15944 P.first = P.second = 0;
15946 if (Cnt < StartIdx + MinVF) {
15947 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
15948 [](std::pair<unsigned, unsigned> &
P) {
15949 P.first = P.second = 0;
15951 StartIdx = Cnt +
Size;
15953 if (Cnt > Sz -
Size - MinVF) {
15955 [](std::pair<unsigned, unsigned> &
P) {
15956 P.first = P.second = 0;
15965 if (
Size > 2 && Res &&
15967 std::bind(VFIsProfitable,
Size >= MaxRegVF, TreeSize,
15968 std::placeholders::_1))) {
15974 if (
Size > MaxRegVF && TreeSize > 1 &&
15976 std::bind(FirstSizeSame, TreeSize,
15977 std::placeholders::_1))) {
15979 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
15985 [&](std::pair<unsigned, unsigned> &
P) {
15986 if (Size >= MaxRegVF)
15987 P.second = std::max(P.second, TreeSize);
15989 P.first = std::max(P.first, TreeSize);
15992 AnyProfitableGraph =
true;
15994 if (StartIdx >=
End)
15996 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
15997 AnyProfitableGraph =
true;
15998 StartIdx = std::distance(
15999 RangeSizes.begin(),
16000 find_if(RangeSizes.drop_front(Sz),
16001 std::bind(IsNotVectorized,
Size >= MaxRegVF,
16002 std::placeholders::_1)));
16004 if (!AnyProfitableGraph &&
Size >= MaxRegVF)
16008 if (
all_of(RangeSizes, [](
const std::pair<unsigned, unsigned> &
P) {
16009 return P.first == 0 &&
P.second == 0;
16013 if (Repeat >= MaxAttempts ||
16014 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
16016 constexpr unsigned StoresLimit = 64;
16017 const unsigned MaxTotalNum =
bit_floor(std::min<unsigned>(
16019 static_cast<unsigned>(
16022 RangeSizes.begin(),
16023 find_if(RangeSizes, std::bind(IsNotVectorized,
true,
16024 std::placeholders::_1))) +
16027 if (VF > MaxTotalNum || VF >= StoresLimit)
16029 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &
P) {
16031 P.first = std::max(
P.second,
P.first);
16035 CandidateVFs.clear();
16036 CandidateVFs.push_back(VF);
16083 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
16085 Stores[Set.first]->getValueOperand()->getType(),
16086 Stores[Set.first]->getPointerOperand(),
16087 SI->getValueOperand()->getType(),
SI->getPointerOperand(), *
DL, *SE,
16091 auto It = Set.second.find(std::make_pair(
Idx, *Diff));
16092 if (It == Set.second.end()) {
16093 Set.second.emplace(
Idx, *Diff);
16097 TryToVectorize(Set.second);
16098 StoreIndexToDistSet PrevSet;
16099 PrevSet.swap(Set.second);
16101 Set.second.emplace(
Idx, 0);
16104 unsigned StartIdx = It->first + 1;
16109 for (
const std::pair<unsigned, int> &Pair :
reverse(PrevSet)) {
16111 if (Pair.first <= It->first ||
16112 VectorizedStores.
contains(Stores[Pair.first]))
16114 unsigned BI = Pair.first - StartIdx;
16115 UsedStores.set(BI);
16116 Dists[BI] = Pair.second - It->second;
16118 for (
unsigned I = StartIdx;
I <
Idx; ++
I) {
16119 unsigned BI =
I - StartIdx;
16120 if (UsedStores.test(BI))
16121 Set.second.emplace(
I, Dists[BI]);
16125 auto &Res = SortedStores.emplace_back();
16127 Res.second.emplace(
Idx, 0);
16133 SI->getValueOperand()->getType()) {
16134 for (
auto &Set : SortedStores)
16135 TryToVectorize(Set.second);
16136 SortedStores.clear();
16139 FillStoresSet(
I, SI);
16143 for (
auto &Set : SortedStores)
16144 TryToVectorize(Set.second);
16149void SLPVectorizerPass::collectSeedInstructions(
BasicBlock *BB) {
16160 if (
auto *SI = dyn_cast<StoreInst>(&
I)) {
16161 if (!
SI->isSimple())
16171 else if (
auto *
GEP = dyn_cast<GetElementPtrInst>(&
I)) {
16172 if (
GEP->getNumIndices() != 1)
16175 if (isa<Constant>(
Idx))
16179 if (
GEP->getType()->isVectorTy())
16191 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize a list of length = "
16192 << VL.
size() <<
".\n");
16197 if (!S.getOpcode())
16203 for (
Value *V : VL) {
16204 Type *Ty =
V->getType();
16208 R.getORE()->emit([&]() {
16209 std::string TypeStr;
16213 <<
"Cannot SLP vectorize list: type "
16214 << rso.str() +
" is unsupported by vectorizer";
16220 unsigned Sz =
R.getVectorElementSize(I0);
16221 unsigned MinVF =
R.getMinVF(Sz);
16222 unsigned MaxVF = std::max<unsigned>(
llvm::bit_floor(VL.size()), MinVF);
16223 MaxVF = std::min(
R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
16225 R.getORE()->emit([&]() {
16227 <<
"Cannot SLP vectorize list: vectorization factor "
16228 <<
"less than 2 is not supported";
16233 bool Changed =
false;
16234 bool CandidateFound =
false;
16236 Type *ScalarTy = VL[0]->getType();
16237 if (
auto *IE = dyn_cast<InsertElementInst>(VL[0]))
16238 ScalarTy =
IE->getOperand(1)->getType();
16240 unsigned NextInst = 0, MaxInst = VL.size();
16241 for (
unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
16248 for (
unsigned I = NextInst;
I < MaxInst; ++
I) {
16249 unsigned ActualVF = std::min(MaxInst -
I, VF);
16254 if (MaxVFOnly && ActualVF < MaxVF)
16256 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
16262 auto *
I = dyn_cast<Instruction>(V);
16263 return I &&
R.isDeleted(
I);
16267 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << ActualVF <<
" operations "
16271 if (
R.isTreeTinyAndNotFullyVectorizable())
16273 R.reorderTopToBottom();
16274 R.reorderBottomToTop(
16275 !isa<InsertElementInst>(Ops.
front()) &&
16276 !
R.doesRootHaveInTreeUses());
16277 R.buildExternalUses();
16279 R.computeMinimumValueSizes();
16280 R.transformNodes();
16282 CandidateFound =
true;
16283 MinCost = std::min(MinCost,
Cost);
16286 <<
" for VF=" << ActualVF <<
"\n");
16290 cast<Instruction>(Ops[0]))
16291 <<
"SLP vectorized with cost " <<
ore::NV(
"Cost",
Cost)
16292 <<
" and with tree size "
16293 <<
ore::NV(
"TreeSize",
R.getTreeSize()));
16304 if (!Changed && CandidateFound) {
16305 R.getORE()->emit([&]() {
16307 <<
"List vectorization was possible but not beneficial with cost "
16308 <<
ore::NV(
"Cost", MinCost) <<
" >= "
16311 }
else if (!Changed) {
16312 R.getORE()->emit([&]() {
16314 <<
"Cannot SLP vectorize list: vectorization was impossible"
16315 <<
" with available vectorization factors";
16325 if (!isa<BinaryOperator, CmpInst>(
I) || isa<VectorType>(
I->getType()))
16331 auto *Op0 = dyn_cast<Instruction>(
I->getOperand(0));
16332 auto *Op1 = dyn_cast<Instruction>(
I->getOperand(1));
16333 if (!Op0 || !Op1 || Op0->getParent() !=
P || Op1->getParent() !=
P)
16340 auto *
A = dyn_cast<BinaryOperator>(Op0);
16341 auto *
B = dyn_cast<BinaryOperator>(Op1);
16343 if (
A &&
B &&
B->hasOneUse()) {
16344 auto *B0 = dyn_cast<BinaryOperator>(
B->getOperand(0));
16345 auto *B1 = dyn_cast<BinaryOperator>(
B->getOperand(1));
16346 if (B0 && B0->getParent() ==
P)
16348 if (B1 && B1->getParent() ==
P)
16352 if (
B &&
A &&
A->hasOneUse()) {
16353 auto *A0 = dyn_cast<BinaryOperator>(
A->getOperand(0));
16354 auto *A1 = dyn_cast<BinaryOperator>(
A->getOperand(1));
16355 if (A0 && A0->getParent() ==
P)
16357 if (A1 && A1->getParent() ==
P)
16361 if (Candidates.
size() == 1)
16362 return tryToVectorizeList({Op0, Op1},
R);
16365 std::optional<int> BestCandidate =
R.findBestRootPair(Candidates);
16366 if (!BestCandidate)
16368 return tryToVectorizeList(
16369 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
R);
16403 ReductionOpsListType ReductionOps;
16415 bool IsSupportedHorRdxIdentityOp =
false;
16426 return isa<SelectInst>(
I) &&
16432 if (Kind == RecurKind::None)
16440 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
16444 return I->getFastMathFlags().noNaNs();
16447 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
16450 return I->isAssociative();
16459 return I->getOperand(2);
16460 return I->getOperand(
Index);
16468 case RecurKind::Or:
16474 case RecurKind::And:
16480 case RecurKind::Add:
16481 case RecurKind::Mul:
16482 case RecurKind::Xor:
16483 case RecurKind::FAdd:
16484 case RecurKind::FMul:
16487 case RecurKind::FMax:
16489 case RecurKind::FMin:
16491 case RecurKind::FMaximum:
16493 case RecurKind::FMinimum:
16495 case RecurKind::SMax:
16501 case RecurKind::SMin:
16507 case RecurKind::UMax:
16513 case RecurKind::UMin:
16528 const ReductionOpsListType &ReductionOps) {
16529 bool UseSelect = ReductionOps.size() == 2 ||
16531 (ReductionOps.size() == 1 &&
16532 any_of(ReductionOps.front(), IsaPred<SelectInst>));
16533 assert((!UseSelect || ReductionOps.size() != 2 ||
16534 isa<SelectInst>(ReductionOps[1][0])) &&
16535 "Expected cmp + select pairs for reduction");
16538 if (
auto *Sel = dyn_cast<SelectInst>(
Op)) {
16552 auto *
I = dyn_cast<Instruction>(V);
16554 return RecurKind::None;
16556 return RecurKind::Add;
16558 return RecurKind::Mul;
16561 return RecurKind::And;
16564 return RecurKind::Or;
16566 return RecurKind::Xor;
16568 return RecurKind::FAdd;
16570 return RecurKind::FMul;
16573 return RecurKind::FMax;
16575 return RecurKind::FMin;
16578 return RecurKind::FMaximum;
16580 return RecurKind::FMinimum;
16586 return RecurKind::SMax;
16588 return RecurKind::SMin;
16590 return RecurKind::UMax;
16592 return RecurKind::UMin;
16594 if (
auto *
Select = dyn_cast<SelectInst>(
I)) {
16616 if (!isa<ExtractElementInst>(
RHS) ||
16618 return RecurKind::None;
16620 if (!isa<ExtractElementInst>(
LHS) ||
16622 return RecurKind::None;
16624 if (!isa<ExtractElementInst>(
LHS) || !isa<ExtractElementInst>(
RHS))
16625 return RecurKind::None;
16629 return RecurKind::None;
16634 return RecurKind::None;
16637 return RecurKind::SMax;
16640 return RecurKind::SMin;
16643 return RecurKind::UMax;
16646 return RecurKind::UMin;
16649 return RecurKind::None;
16653 static unsigned getFirstOperandIndex(
Instruction *
I) {
16654 return isCmpSelMinMax(
I) ? 1 : 0;
16660 return isCmpSelMinMax(
I) ? 3 : 2;
16666 if (isCmpSelMinMax(
I) || isBoolLogicOp(
I)) {
16667 auto *Sel = cast<SelectInst>(
I);
16668 auto *
Cmp = dyn_cast<Instruction>(Sel->getCondition());
16669 return Sel->getParent() == BB &&
Cmp &&
Cmp->getParent() == BB;
16671 return I->getParent() == BB;
16675 static bool hasRequiredNumberOfUses(
bool IsCmpSelMinMax,
Instruction *
I) {
16676 if (IsCmpSelMinMax) {
16679 if (
auto *Sel = dyn_cast<SelectInst>(
I))
16680 return Sel->
hasNUses(2) && Sel->getCondition()->hasOneUse();
16681 return I->hasNUses(2);
16685 return I->hasOneUse();
16690 if (isCmpSelMinMax(
I))
16691 ReductionOps.assign(2, ReductionOpsType());
16693 ReductionOps.assign(1, ReductionOpsType());
16698 if (isCmpSelMinMax(
I)) {
16699 ReductionOps[0].emplace_back(cast<SelectInst>(
I)->getCondition());
16700 ReductionOps[1].emplace_back(
I);
16702 ReductionOps[0].emplace_back(
I);
16707 int Sz = Data.size();
16708 auto *
I = dyn_cast<Instruction>(Data.front());
16709 return Sz > 1 ||
isConstant(Data.front()) ||
16720 RdxKind = HorizontalReduction::getRdxKind(Root);
16721 if (!isVectorizable(RdxKind, Root))
16732 if (
auto *Sel = dyn_cast<SelectInst>(Root))
16733 if (!Sel->getCondition()->hasOneUse())
16736 ReductionRoot = Root;
16741 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
16750 for (
int I = getFirstOperandIndex(TreeN),
16751 End = getNumberOfOperands(TreeN);
16753 Value *EdgeVal = getRdxOperand(TreeN,
I);
16754 ReducedValsToOps[EdgeVal].push_back(TreeN);
16755 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
16758 !hasSameParent(EdgeInst, BB)) {
16759 ExtraArgs.push_back(EdgeVal);
16766 if (!EdgeInst ||
getRdxKind(EdgeInst) != RdxKind ||
16767 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
16768 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
16769 !isVectorizable(RdxKind, EdgeInst) ||
16770 (
R.isAnalyzedReductionRoot(EdgeInst) &&
16771 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
16772 PossibleReducedVals.push_back(EdgeVal);
16775 ReductionOps.push_back(EdgeInst);
16784 PossibleReducedVals;
16785 initReductionOps(Root);
16790 auto GenerateLoadsSubkey = [&](
size_t Key,
LoadInst *LI) {
16793 auto LIt = LoadsMap.
find(
Ptr);
16794 if (LIt != LoadsMap.
end()) {
16795 for (
LoadInst *RLI : LIt->second) {
16801 for (
LoadInst *RLI : LIt->second) {
16805 DoNotReverseVals.
insert(RLI);
16809 if (LIt->second.size() > 2) {
16811 hash_value(LIt->second.back()->getPointerOperand());
16812 DoNotReverseVals.
insert(LIt->second.back());
16817 LoadKeyUsed.
insert(Key);
16822 while (!Worklist.empty()) {
16827 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
16830 if (
Args.size() < 2) {
16831 addReductionOps(TreeN);
16833 if (!
Args.empty()) {
16834 assert(
Args.size() == 1 &&
"Expected only single argument.");
16835 ExtraArgs[TreeN] =
Args.front();
16839 for (
Value *V : PossibleRedVals) {
16843 ++PossibleReducedVals[
Key][
Idx]
16844 .
insert(std::make_pair(V, 0))
16847 Worklist.append(PossibleReductionOps.
rbegin(),
16848 PossibleReductionOps.
rend());
16853 ++PossibleReducedVals[
Key][
Idx]
16854 .
insert(std::make_pair(TreeN, 0))
16858 auto PossibleReducedValsVect = PossibleReducedVals.
takeVector();
16861 for (
auto &PossibleReducedVals : PossibleReducedValsVect) {
16862 auto PossibleRedVals = PossibleReducedVals.second.
takeVector();
16864 for (
auto It = PossibleRedVals.begin(),
E = PossibleRedVals.end();
16867 auto RedValsVect = It->second.takeVector();
16869 for (
const std::pair<Value *, unsigned> &Data : RedValsVect)
16870 PossibleRedValsVect.
back().append(Data.second, Data.first);
16872 stable_sort(PossibleRedValsVect, [](
const auto &P1,
const auto &P2) {
16873 return P1.size() > P2.size();
16877 if (isGoodForReduction(Data) ||
16878 (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
16879 isa<LoadInst>(ReducedVals[NewIdx].front()) &&
16881 cast<LoadInst>(Data.front())->getPointerOperand()) ==
16885 NewIdx = ReducedVals.
size();
16888 if (DoNotReverseVals.
contains(Data.front()))
16889 ReducedVals[NewIdx].
append(Data.begin(), Data.end());
16891 ReducedVals[NewIdx].
append(Data.rbegin(), Data.rend());
16893 ReducedVals.
emplace_back().append(Data.rbegin(), Data.rend());
16908 constexpr int ReductionLimit = 4;
16909 constexpr unsigned RegMaxNumber = 4;
16910 constexpr unsigned RedValsMaxNumber = 128;
16914 unsigned NumReducedVals =
16915 std::accumulate(ReducedVals.
begin(), ReducedVals.
end(), 0,
16917 if (!isGoodForReduction(Vals))
16919 return Num + Vals.size();
16921 if (NumReducedVals < ReductionLimit &&
16926 for (ReductionOpsType &RdxOps : ReductionOps)
16927 for (
Value *RdxOp : RdxOps)
16928 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
16939 ReducedVals.
size() * ReducedVals.
front().size() + ExtraArgs.size());
16942 ExternallyUsedValues.
reserve(ExtraArgs.size() + 1);
16945 for (
const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
16946 assert(Pair.first &&
"DebugLoc must be set.");
16947 ExternallyUsedValues[Pair.second].push_back(Pair.first);
16948 TrackedVals.
try_emplace(Pair.second, Pair.second);
16953 auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
16954 assert(isa<SelectInst>(RdxRootInst) &&
16955 "Expected min/max reduction to have select root instruction");
16956 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
16957 assert(isa<Instruction>(ScalarCond) &&
16958 "Expected min/max reduction to have compare condition");
16959 return cast<Instruction>(ScalarCond);
16963 auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
16964 if (VectorizedTree) {
16967 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
16968 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
16971 auto It = ReducedValsToOps.
find(Res);
16972 if (It != ReducedValsToOps.
end() &&
16978 return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
16984 bool AnyBoolLogicOp =
16986 return isBoolLogicOp(cast<Instruction>(V));
16990 ExternallyUsedValues[ReductionRoot];
16992 ReductionOps.front().size());
16993 for (ReductionOpsType &RdxOps : ReductionOps)
16994 for (
Value *RdxOp : RdxOps) {
16997 IgnoreList.insert(RdxOp);
17002 for (
Value *U : IgnoreList)
17003 if (
auto *FPMO = dyn_cast<FPMathOperator>(U))
17004 RdxFMF &= FPMO->getFastMathFlags();
17005 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
17010 for (
Value *V : Candidates)
17011 TrackedVals.try_emplace(V, V);
17017 Value *VectorizedTree =
nullptr;
17018 bool CheckForReusedReductionOps =
false;
17020 for (
unsigned I = 0,
E = ReducedVals.
size();
I <
E; ++
I) {
17026 for (
unsigned Cnt = 0, Sz = OrigReducedVals.
size(); Cnt < Sz; ++Cnt) {
17027 Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
17032 auto *Inst = dyn_cast<Instruction>(RdxVal);
17034 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
17035 (S.getOpcode() && !Inst))
17038 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
17040 bool ShuffledExtracts =
false;
17042 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
17044 InstructionsState NextS =
getSameOpcode(ReducedVals[
I + 1], TLI);
17045 if (NextS.getOpcode() == Instruction::ExtractElement &&
17046 !NextS.isAltShuffle()) {
17048 for (
Value *RV : ReducedVals[
I + 1]) {
17049 Value *RdxVal = TrackedVals.find(RV)->second;
17053 if (
auto *Inst = dyn_cast<Instruction>(RdxVal))
17054 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
17056 CommonCandidates.push_back(RdxVal);
17057 TrackedToOrig.try_emplace(RdxVal, RV);
17062 Candidates.
swap(CommonCandidates);
17063 ShuffledExtracts =
true;
17072 ++VectorizedVals.try_emplace(Candidates.
front(), 0).first->getSecond();
17074 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
17075 ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
17076 if (
auto *ResI = dyn_cast<Instruction>(Res))
17077 V.analyzedReductionRoot(ResI);
17079 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
17083 unsigned NumReducedVals = Candidates.
size();
17084 if (NumReducedVals < ReductionLimit &&
17091 IsSupportedHorRdxIdentityOp =
17093 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
17096 if (IsSupportedHorRdxIdentityOp)
17097 for (
Value *V : Candidates)
17098 ++SameValuesCounter.
insert(std::make_pair(V, 0)).first->second;
17109 bool SameScaleFactor =
false;
17110 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
17111 SameValuesCounter.
size() != Candidates.size();
17112 if (OptReusedScalars) {
17114 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
17115 RdxKind == RecurKind::Xor) &&
17117 [&SameValuesCounter](
const std::pair<Value *, unsigned> &
P) {
17118 return P.second == SameValuesCounter.
front().second;
17120 Candidates.resize(SameValuesCounter.
size());
17121 transform(SameValuesCounter, Candidates.begin(),
17122 [](
const auto &
P) { return P.first; });
17123 NumReducedVals = Candidates.size();
17125 if (NumReducedVals == 1) {
17126 Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
17127 unsigned Cnt = SameValuesCounter.
lookup(OrigV);
17129 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
17130 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17131 VectorizedVals.try_emplace(OrigV, Cnt);
17136 unsigned MaxVecRegSize =
V.getMaxVecRegSize();
17137 unsigned EltSize =
V.getVectorElementSize(Candidates[0]);
17141 unsigned ReduxWidth = std::min<unsigned>(
17143 std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
17144 RegMaxNumber * RedValsMaxNumber));
17145 unsigned Start = 0;
17146 unsigned Pos = Start;
17148 unsigned PrevReduxWidth = ReduxWidth;
17149 bool CheckForReusedReductionOpsLocal =
false;
17150 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
17151 &CheckForReusedReductionOpsLocal,
17152 &PrevReduxWidth, &
V,
17153 &IgnoreList](
bool IgnoreVL =
false) {
17154 bool IsAnyRedOpGathered = !IgnoreVL &&
V.isAnyGathered(IgnoreList);
17155 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
17158 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
17161 if (Pos < NumReducedVals - ReduxWidth + 1)
17162 return IsAnyRedOpGathered;
17165 return IsAnyRedOpGathered;
17167 bool AnyVectorized =
false;
17168 while (Pos < NumReducedVals - ReduxWidth + 1 &&
17169 ReduxWidth >= ReductionLimit) {
17172 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
17174 CheckForReusedReductionOps =
true;
17177 PrevReduxWidth = ReduxWidth;
17180 if (
V.areAnalyzedReductionVals(VL)) {
17181 (void)AdjustReducedVals(
true);
17187 auto *RedValI = dyn_cast<Instruction>(RedVal);
17190 return V.isDeleted(RedValI);
17193 V.buildTree(VL, IgnoreList);
17194 if (
V.isTreeTinyAndNotFullyVectorizable(
true)) {
17195 if (!AdjustReducedVals())
17196 V.analyzedReductionVals(VL);
17199 if (
V.isLoadCombineReductionCandidate(RdxKind)) {
17200 if (!AdjustReducedVals())
17201 V.analyzedReductionVals(VL);
17204 V.reorderTopToBottom();
17206 V.reorderBottomToTop(
true);
17210 ExternallyUsedValues);
17211 for (
unsigned Cnt = 0, Sz = ReducedVals.
size(); Cnt < Sz; ++Cnt) {
17212 if (Cnt ==
I || (ShuffledExtracts && Cnt ==
I - 1))
17214 for (
Value *V : ReducedVals[Cnt])
17215 if (isa<Instruction>(V))
17216 LocalExternallyUsedValues[TrackedVals[
V]];
17218 if (!IsSupportedHorRdxIdentityOp) {
17221 "Reused values counter map is not empty");
17222 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17223 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17225 Value *
V = Candidates[Cnt];
17226 Value *OrigV = TrackedToOrig.find(V)->second;
17227 ++SameValuesCounter[OrigV];
17233 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17234 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17236 Value *RdxVal = Candidates[Cnt];
17237 if (!Visited.
insert(RdxVal).second)
17241 if (!VLScalars.contains(RdxVal) &&
V.isVectorized(RdxVal)) {
17242 LocalExternallyUsedValues[RdxVal];
17245 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17247 VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
17248 if (NumOps != ReducedValsToOps.
find(OrigV)->second.size())
17249 LocalExternallyUsedValues[RdxVal];
17252 if (!IsSupportedHorRdxIdentityOp)
17253 SameValuesCounter.
clear();
17254 for (
Value *RdxVal : VL)
17255 if (RequiredExtract.
contains(RdxVal))
17256 LocalExternallyUsedValues[RdxVal];
17260 for (
const std::pair<Value *, Value *> &Pair : ReplacedExternals)
17261 ReplacementToExternal.
try_emplace(Pair.second, Pair.first);
17262 for (
const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
17264 auto RIt = ReplacementToExternal.
find(Ext);
17265 while (RIt != ReplacementToExternal.
end()) {
17267 RIt = ReplacementToExternal.
find(Ext);
17269 auto *It = ExternallyUsedValues.
find(Ext);
17270 if (It == ExternallyUsedValues.
end())
17272 LocalExternallyUsedValues[Pair.second].append(It->second);
17274 V.buildExternalUses(LocalExternallyUsedValues);
17276 V.computeMinimumValueSizes();
17277 V.transformNodes();
17282 getReductionCost(
TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
17285 <<
" for reduction\n");
17289 V.getORE()->emit([&]() {
17291 SV_NAME,
"HorSLPNotBeneficial",
17292 ReducedValsToOps.
find(VL[0])->second.front())
17293 <<
"Vectorizing horizontal reduction is possible "
17294 <<
"but not beneficial with cost " <<
ore::NV(
"Cost",
Cost)
17295 <<
" and threshold "
17298 if (!AdjustReducedVals())
17299 V.analyzedReductionVals(VL);
17303 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorizing horizontal reduction at cost:"
17304 <<
Cost <<
". (HorRdx)\n");
17305 V.getORE()->emit([&]() {
17307 SV_NAME,
"VectorizedHorizontalReduction",
17308 ReducedValsToOps.
find(VL[0])->second.front())
17309 <<
"Vectorized horizontal reduction with cost "
17310 <<
ore::NV(
"Cost",
Cost) <<
" and with tree size "
17311 <<
ore::NV(
"TreeSize",
V.getTreeSize());
17318 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
17320 if (IsCmpSelMinMax)
17321 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
17324 Value *VectorizedRoot =
V.vectorizeTree(LocalExternallyUsedValues,
17325 ReplacedExternals, InsertPt);
17332 if ((isBoolLogicOp(RdxRootInst) ||
17333 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
17335 VectorizedRoot = Builder.
CreateFreeze(VectorizedRoot);
17338 if (OptReusedScalars && !SameScaleFactor) {
17340 emitReusedOps(VectorizedRoot, Builder,
V.getRootNodeScalars(),
17341 SameValuesCounter, TrackedToOrig);
17344 Value *ReducedSubTree =
17345 emitReduction(VectorizedRoot, Builder, ReduxWidth,
TTI);
17346 if (ReducedSubTree->
getType() != VL.front()->getType()) {
17348 ReducedSubTree, VL.front()->getType(),
any_of(VL, [&](
Value *R) {
17350 R, cast<Instruction>(ReductionOps.front().front())
17352 ->getDataLayout());
17360 if (OptReusedScalars && SameScaleFactor)
17361 ReducedSubTree = emitScaleForReusedOps(
17362 ReducedSubTree, Builder, SameValuesCounter.
front().second);
17364 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
17366 for (
Value *RdxVal : VL) {
17367 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17368 if (IsSupportedHorRdxIdentityOp) {
17369 VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
17372 ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
17373 if (!
V.isVectorized(RdxVal))
17374 RequiredExtract.
insert(RdxVal);
17379 AnyVectorized =
true;
17381 if (OptReusedScalars && !AnyVectorized) {
17382 for (
const std::pair<Value *, unsigned> &
P : SameValuesCounter) {
17383 Value *RedVal = emitScaleForReusedOps(
P.first, Builder,
P.second);
17384 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17385 Value *OrigV = TrackedToOrig.find(
P.first)->second;
17386 VectorizedVals.try_emplace(OrigV,
P.second);
17391 if (VectorizedTree) {
17412 if (!AnyBoolLogicOp)
17414 if (isBoolLogicOp(RedOp1) &&
17415 ((!InitStep &&
LHS == VectorizedTree) ||
17418 if (isBoolLogicOp(RedOp2) && ((!InitStep &&
RHS == VectorizedTree) ||
17419 getRdxOperand(RedOp2, 0) ==
RHS ||
17424 if (
LHS != VectorizedTree)
17435 unsigned Sz = InstVals.
size();
17438 for (
unsigned I = 0,
E = (Sz / 2) * 2;
I <
E;
I += 2) {
17441 Value *RdxVal1 = InstVals[
I].second;
17442 Value *StableRdxVal1 = RdxVal1;
17443 auto It1 = TrackedVals.find(RdxVal1);
17444 if (It1 != TrackedVals.end())
17445 StableRdxVal1 = It1->second;
17446 Value *RdxVal2 = InstVals[
I + 1].second;
17447 Value *StableRdxVal2 = RdxVal2;
17448 auto It2 = TrackedVals.find(RdxVal2);
17449 if (It2 != TrackedVals.end())
17450 StableRdxVal2 = It2->second;
17454 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[
I].first,
17456 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
17457 StableRdxVal2,
"op.rdx", ReductionOps);
17458 ExtraReds[
I / 2] = std::make_pair(InstVals[
I].first, ExtraRed);
17461 ExtraReds[Sz / 2] = InstVals.
back();
17465 ExtraReductions.
emplace_back(cast<Instruction>(ReductionRoot),
17469 for (
Value *RdxVal : Candidates) {
17470 if (!Visited.
insert(RdxVal).second)
17472 unsigned NumOps = VectorizedVals.lookup(RdxVal);
17479 for (
auto &Pair : ExternallyUsedValues) {
17481 for (
auto *
I : Pair.second)
17485 bool InitStep =
true;
17486 while (ExtraReductions.
size() > 1) {
17488 FinalGen(ExtraReductions, InitStep);
17489 ExtraReductions.
swap(NewReds);
17492 VectorizedTree = ExtraReductions.
front().second;
17494 ReductionRoot->replaceAllUsesWith(VectorizedTree);
17503 IgnoreSet.
insert(RdxOps.begin(), RdxOps.end());
17510 for (
auto *U :
Ignore->users()) {
17512 "All users must be either in the reduction ops list.");
17515 if (!
Ignore->use_empty()) {
17517 Ignore->replaceAllUsesWith(Undef);
17519 V.eraseInstruction(cast<Instruction>(
Ignore));
17522 }
else if (!CheckForReusedReductionOps) {
17523 for (ReductionOpsType &RdxOps : ReductionOps)
17524 for (
Value *RdxOp : RdxOps)
17525 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17527 return VectorizedTree;
17534 bool IsCmpSelMinMax,
unsigned ReduxWidth,
17537 Type *ScalarTy = ReducedVals.
front()->getType();
17546 int Cnt = ReducedVals.
size();
17547 for (
Value *RdxVal : ReducedVals) {
17552 Cost += GenCostFn();
17557 auto *RdxOp = cast<Instruction>(U);
17558 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
17566 Cost += ScalarCost;
17568 Cost += GenCostFn();
17573 case RecurKind::Add:
17574 case RecurKind::Mul:
17575 case RecurKind::Or:
17576 case RecurKind::And:
17577 case RecurKind::Xor:
17578 case RecurKind::FAdd:
17579 case RecurKind::FMul: {
17584 ScalarCost = EvaluateScalarCost([&]() {
17589 case RecurKind::FMax:
17590 case RecurKind::FMin:
17591 case RecurKind::FMaximum:
17592 case RecurKind::FMinimum:
17593 case RecurKind::SMax:
17594 case RecurKind::SMin:
17595 case RecurKind::UMax:
17596 case RecurKind::UMin: {
17600 ScalarCost = EvaluateScalarCost([&]() {
17610 LLVM_DEBUG(
dbgs() <<
"SLP: Adding cost " << VectorCost - ScalarCost
17612 <<
" (It is a splitting reduction)\n");
17613 return VectorCost - ScalarCost;
17619 assert(VectorizedValue &&
"Need to have a vectorized tree node");
17621 "We only handle power-of-two reductions for now");
17622 assert(RdxKind != RecurKind::FMulAdd &&
17623 "A call to the llvm.fmuladd intrinsic is not handled yet");
17625 ++NumVectorInstructions;
17632 assert(IsSupportedHorRdxIdentityOp &&
17633 "The optimization of matched scalar identity horizontal reductions "
17634 "must be supported.");
17636 case RecurKind::Add: {
17638 Value *Scale = ConstantInt::get(VectorizedValue->
getType(), Cnt);
17640 << VectorizedValue <<
". (HorRdx)\n");
17641 return Builder.
CreateMul(VectorizedValue, Scale);
17643 case RecurKind::Xor: {
17645 LLVM_DEBUG(
dbgs() <<
"SLP: Xor " << Cnt <<
"of " << VectorizedValue
17646 <<
". (HorRdx)\n");
17649 return VectorizedValue;
17651 case RecurKind::FAdd: {
17653 Value *Scale = ConstantFP::get(VectorizedValue->
getType(), Cnt);
17655 << VectorizedValue <<
". (HorRdx)\n");
17656 return Builder.
CreateFMul(VectorizedValue, Scale);
17658 case RecurKind::And:
17659 case RecurKind::Or:
17660 case RecurKind::SMax:
17661 case RecurKind::SMin:
17662 case RecurKind::UMax:
17663 case RecurKind::UMin:
17664 case RecurKind::FMax:
17665 case RecurKind::FMin:
17666 case RecurKind::FMaximum:
17667 case RecurKind::FMinimum:
17669 return VectorizedValue;
17670 case RecurKind::Mul:
17671 case RecurKind::FMul:
17672 case RecurKind::FMulAdd:
17673 case RecurKind::IAnyOf:
17674 case RecurKind::FAnyOf:
17675 case RecurKind::None:
17687 assert(IsSupportedHorRdxIdentityOp &&
17688 "The optimization of matched scalar identity horizontal reductions "
17689 "must be supported.");
17690 auto *VTy = cast<FixedVectorType>(VectorizedValue->
getType());
17691 if (VTy->getElementType() != VL.
front()->getType()) {
17697 R, cast<Instruction>(ReductionOps.front().front())
17699 ->getDataLayout());
17704 case RecurKind::Add: {
17707 for (
Value *V : VL) {
17708 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
find(V)->second);
17709 Vals.
push_back(ConstantInt::get(
V->getType(), Cnt,
false));
17713 << VectorizedValue <<
". (HorRdx)\n");
17714 return Builder.
CreateMul(VectorizedValue, Scale);
17716 case RecurKind::And:
17717 case RecurKind::Or:
17720 <<
". (HorRdx)\n");
17721 return VectorizedValue;
17722 case RecurKind::SMax:
17723 case RecurKind::SMin:
17724 case RecurKind::UMax:
17725 case RecurKind::UMin:
17726 case RecurKind::FMax:
17727 case RecurKind::FMin:
17728 case RecurKind::FMaximum:
17729 case RecurKind::FMinimum:
17732 <<
". (HorRdx)\n");
17733 return VectorizedValue;
17734 case RecurKind::Xor: {
17740 cast<FixedVectorType>(VectorizedValue->
getType())->getNumElements(),
17742 std::iota(
Mask.begin(),
Mask.end(), 0);
17743 bool NeedShuffle =
false;
17744 for (
unsigned I = 0, VF = VL.size();
I < VF; ++
I) {
17746 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
find(V)->second);
17747 if (Cnt % 2 == 0) {
17749 NeedShuffle =
true;
17755 dbgs() <<
"> of " << VectorizedValue <<
". (HorRdx)\n");
17759 ConstantVector::getNullValue(VectorizedValue->
getType()),
Mask);
17760 return VectorizedValue;
17762 case RecurKind::FAdd: {
17765 for (
Value *V : VL) {
17766 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
find(V)->second);
17767 Vals.
push_back(ConstantFP::get(
V->getType(), Cnt));
17770 return Builder.
CreateFMul(VectorizedValue, Scale);
17772 case RecurKind::Mul:
17773 case RecurKind::FMul:
17774 case RecurKind::FMulAdd:
17775 case RecurKind::IAnyOf:
17776 case RecurKind::FAnyOf:
17777 case RecurKind::None:
17787 return HorizontalReduction::getRdxKind(V);
17790 if (
auto *IE = dyn_cast<InsertElementInst>(InsertInst))
17791 return cast<FixedVectorType>(IE->getType())->getNumElements();
17793 unsigned AggregateSize = 1;
17794 auto *
IV = cast<InsertValueInst>(InsertInst);
17795 Type *CurrentType =
IV->getType();
17797 if (
auto *ST = dyn_cast<StructType>(CurrentType)) {
17798 for (
auto *Elt : ST->elements())
17799 if (Elt != ST->getElementType(0))
17800 return std::nullopt;
17801 AggregateSize *= ST->getNumElements();
17802 CurrentType = ST->getElementType(0);
17803 }
else if (
auto *AT = dyn_cast<ArrayType>(CurrentType)) {
17804 AggregateSize *= AT->getNumElements();
17805 CurrentType = AT->getElementType();
17806 }
else if (
auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
17807 AggregateSize *= VT->getNumElements();
17808 return AggregateSize;
17810 return AggregateSize;
17812 return std::nullopt;
17821 unsigned OperandOffset) {
17824 std::optional<unsigned> OperandIndex =
17828 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
17830 BuildVectorOpds, InsertElts, *OperandIndex);
17833 BuildVectorOpds[*OperandIndex] = InsertedOperand;
17834 InsertElts[*OperandIndex] = LastInsertInst;
17836 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->
getOperand(0));
17837 }
while (LastInsertInst !=
nullptr &&
17838 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
17861 assert((isa<InsertElementInst>(LastInsertInst) ||
17862 isa<InsertValueInst>(LastInsertInst)) &&
17863 "Expected insertelement or insertvalue instruction!");
17866 "Expected empty result vectors!");
17869 if (!AggregateSize)
17871 BuildVectorOpds.
resize(*AggregateSize);
17872 InsertElts.
resize(*AggregateSize);
17877 if (BuildVectorOpds.
size() >= 2)
17895 auto DominatedReduxValue = [&](
Value *R) {
17896 return isa<Instruction>(R) &&
17897 DT->
dominates(
P->getParent(), cast<Instruction>(R)->getParent());
17903 if (
P->getIncomingBlock(0) == ParentBB) {
17904 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(0));
17905 }
else if (
P->getIncomingBlock(1) == ParentBB) {
17906 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(1));
17909 if (Rdx && DominatedReduxValue(Rdx))
17922 if (
P->getIncomingBlock(0) == BBLatch) {
17923 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(0));
17924 }
else if (
P->getIncomingBlock(1) == BBLatch) {
17925 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(1));
17928 if (Rdx && DominatedReduxValue(Rdx))
17962 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
17963 isa<IntrinsicInst>(Root)) &&
17964 "Expected binop, select, or intrinsic for reduction matching");
17966 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root));
17968 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
17970 return dyn_cast<Instruction>(
RHS);
17972 return dyn_cast<Instruction>(
LHS);
17979 Value *Op0 =
nullptr;
17980 Value *Op1 =
nullptr;
17983 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
17989 Value *B0 =
nullptr, *B1 =
nullptr;
17994bool SLPVectorizerPass::vectorizeHorReduction(
17999 bool TryOperandsAsNewSeeds =
P && isa<BinaryOperator>(Root);
18001 if (Root->
getParent() != BB || isa<PHINode>(Root))
18005 auto SelectRoot = [&]() {
18024 std::queue<std::pair<Instruction *, unsigned>>
Stack;
18025 Stack.emplace(SelectRoot(), 0);
18029 if (
R.isAnalyzedReductionRoot(Inst))
18034 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *
DL, *TLI))
18036 return HorRdx.tryToReduce(R, *
DL,
TTI, *TLI);
18038 auto TryAppendToPostponedInsts = [&](
Instruction *FutureSeed) {
18039 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
18046 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
18051 while (!
Stack.empty()) {
18054 std::tie(Inst, Level) =
Stack.front();
18059 if (
R.isDeleted(Inst))
18061 if (
Value *VectorizedV = TryToReduce(Inst)) {
18063 if (
auto *
I = dyn_cast<Instruction>(VectorizedV)) {
18065 Stack.emplace(
I, Level);
18070 if (!TryAppendToPostponedInsts(Inst)) {
18081 if (VisitedInstrs.
insert(
Op).second)
18082 if (
auto *
I = dyn_cast<Instruction>(
Op))
18085 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(
I) &&
18086 !
R.isDeleted(
I) &&
I->getParent() == BB)
18087 Stack.emplace(
I, Level);
18096 bool Res = vectorizeHorReduction(
P, Root, BB, R,
TTI, PostponedInsts);
18097 Res |= tryToVectorize(PostponedInsts, R);
18104 for (
Value *V : Insts)
18105 if (
auto *Inst = dyn_cast<Instruction>(V); Inst && !
R.isDeleted(Inst))
18106 Res |= tryToVectorize(Inst, R);
18110bool SLPVectorizerPass::vectorizeInsertValueInst(
InsertValueInst *IVI,
18112 if (!
R.canMapToVector(IVI->
getType()))
18120 LLVM_DEBUG(
dbgs() <<
"SLP: array mappable to vector: " << *IVI <<
"\n");
18122 return tryToVectorizeList(BuildVectorOpds, R);
18131 (
llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
18135 LLVM_DEBUG(
dbgs() <<
"SLP: array mappable to vector: " << *IEI <<
"\n");
18136 return tryToVectorizeList(BuildVectorInsts, R);
18139template <
typename T>
18144 bool MaxVFOnly,
BoUpSLP &R) {
18145 bool Changed =
false;
18154 auto *SameTypeIt = IncIt;
18155 while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
18159 unsigned NumElts = (SameTypeIt - IncIt);
18160 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize starting at nodes ("
18161 << NumElts <<
")\n");
18172 TryToVectorizeHelper(
ArrayRef(IncIt, NumElts), MaxVFOnly)) {
18178 auto GetMinNumElements = [&R](
Value *V) {
18179 unsigned EltSize = R.getVectorElementSize(V);
18180 return std::max(2U, R.getMaxVecRegSize() / EltSize);
18182 if (NumElts < GetMinNumElements(*IncIt) &&
18183 (Candidates.
empty() ||
18184 Candidates.
front()->getType() == (*IncIt)->getType())) {
18185 Candidates.
append(IncIt, std::next(IncIt, NumElts));
18189 if (Candidates.
size() > 1 &&
18190 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
18191 if (TryToVectorizeHelper(Candidates,
false)) {
18194 }
else if (MaxVFOnly) {
18196 for (
auto *It = Candidates.
begin(), *
End = Candidates.
end();
18198 auto *SameTypeIt = It;
18199 while (SameTypeIt !=
End && AreCompatible(*SameTypeIt, *It))
18201 unsigned NumElts = (SameTypeIt - It);
18202 if (NumElts > 1 && TryToVectorizeHelper(
ArrayRef(It, NumElts),
18208 Candidates.
clear();
18212 IncIt = SameTypeIt;
18224template <
bool IsCompatibility>
18229 "Expected valid element types only.");
18231 return IsCompatibility;
18232 auto *CI1 = cast<CmpInst>(V);
18233 auto *CI2 = cast<CmpInst>(V2);
18234 if (CI1->getOperand(0)->getType()->getTypeID() <
18236 return !IsCompatibility;
18237 if (CI1->getOperand(0)->getType()->getTypeID() >
18246 if (BasePred1 < BasePred2)
18247 return !IsCompatibility;
18248 if (BasePred1 > BasePred2)
18251 bool CI1Preds = Pred1 == BasePred1;
18252 bool CI2Preds = Pred2 == BasePred1;
18253 for (
int I = 0, E = CI1->getNumOperands();
I < E; ++
I) {
18254 auto *Op1 = CI1->getOperand(CI1Preds ?
I : E -
I - 1);
18255 auto *Op2 = CI2->
getOperand(CI2Preds ?
I : E -
I - 1);
18259 return !IsCompatibility;
18262 if (
auto *I1 = dyn_cast<Instruction>(Op1))
18263 if (
auto *I2 = dyn_cast<Instruction>(Op2)) {
18264 if (IsCompatibility) {
18265 if (I1->getParent() != I2->getParent())
18272 return NodeI2 !=
nullptr;
18275 assert((NodeI1 == NodeI2) ==
18277 "Different nodes should have different DFS numbers");
18278 if (NodeI1 != NodeI2)
18282 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
18284 if (IsCompatibility)
18286 if (I1->getOpcode() != I2->getOpcode())
18287 return I1->getOpcode() < I2->getOpcode();
18290 return IsCompatibility;
18293template <
typename ItT>
18296 bool Changed =
false;
18299 if (
R.isDeleted(
I))
18302 if (
auto *RootOp = dyn_cast<Instruction>(
Op))
18303 Changed |= vectorizeRootInstruction(
nullptr, RootOp, BB, R,
TTI);
18307 if (
R.isDeleted(
I))
18309 Changed |= tryToVectorize(
I, R);
18316 return compareCmp<false>(V, V2, *TLI, *DT);
18319 auto AreCompatibleCompares = [&](
Value *V1,
Value *
V2) {
18322 return compareCmp<true>(V1, V2, *TLI, *DT);
18329 if (Vals.
size() <= 1)
18331 Changed |= tryToVectorizeSequence<Value>(
18332 Vals, CompareSorter, AreCompatibleCompares,
18335 bool ArePossiblyReducedInOtherBlock =
any_of(Candidates, [](
Value *V) {
18337 auto *Select = dyn_cast<SelectInst>(U);
18339 Select->getParent() != cast<Instruction>(V)->getParent();
18342 if (ArePossiblyReducedInOtherBlock)
18344 return tryToVectorizeList(Candidates, R, MaxVFOnly);
18350bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
18352 assert(
all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
18353 "This function only accepts Insert instructions");
18354 bool OpsChanged =
false;
18357 for (
auto *
I :
reverse(Instructions)) {
18358 if (
R.isDeleted(
I))
18360 OpsChanged |= vectorizeHorReduction(
nullptr,
I, BB, R,
TTI, PostponedInsts);
18363 for (
auto *
I :
reverse(Instructions)) {
18364 if (
R.isDeleted(
I) || isa<CmpInst>(
I))
18366 if (
auto *LastInsertValue = dyn_cast<InsertValueInst>(
I)) {
18367 OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
18368 }
else if (
auto *LastInsertElem = dyn_cast<InsertElementInst>(
I)) {
18369 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
18373 OpsChanged |= tryToVectorize(PostponedInsts, R);
18380 bool Changed =
false;
18387 auto PHICompare = [
this, &PHIToOpcodes](
Value *V1,
Value *
V2) {
18390 "Expected vectorizable types only.");
18399 if (Opcodes1.
size() < Opcodes2.
size())
18401 if (Opcodes1.
size() > Opcodes2.
size())
18403 for (
int I = 0, E = Opcodes1.
size();
I < E; ++
I) {
18406 auto *
I1 = dyn_cast<Instruction>(Opcodes1[
I]);
18407 auto *I2 = dyn_cast<Instruction>(Opcodes2[
I]);
18412 return NodeI2 !=
nullptr;
18415 assert((NodeI1 == NodeI2) ==
18417 "Different nodes should have different DFS numbers");
18418 if (NodeI1 != NodeI2)
18421 if (S.getOpcode() && !S.isAltShuffle())
18423 return I1->getOpcode() < I2->getOpcode();
18432 bool C1 = isa<Constant>(Opcodes1[
I]) && !isa<UndefValue>(Opcodes1[
I]);
18433 bool C2 = isa<Constant>(Opcodes2[
I]) && !isa<UndefValue>(Opcodes2[
I]);
18441 bool U1 = isa<UndefValue>(Opcodes1[
I]);
18442 bool U2 = isa<UndefValue>(Opcodes2[
I]);
18446 auto ValID1 = Opcodes1[
I]->getValueID();
18447 auto ValID2 = Opcodes2[
I]->getValueID();
18448 if (ValID1 == ValID2)
18450 if (ValID1 < ValID2)
18452 if (ValID1 > ValID2)
18461 assert(U1 && U2 &&
"The only thing left should be undef & undef.");
18466 auto AreCompatiblePHIs = [&PHIToOpcodes,
this](
Value *V1,
Value *
V2) {
18469 if (V1->getType() !=
V2->getType())
18473 if (Opcodes1.
size() != Opcodes2.
size())
18475 for (
int I = 0, E = Opcodes1.
size();
I < E; ++
I) {
18477 if (isa<UndefValue>(Opcodes1[
I]) || isa<UndefValue>(Opcodes2[
I]))
18479 if (
auto *I1 = dyn_cast<Instruction>(Opcodes1[
I]))
18480 if (
auto *I2 = dyn_cast<Instruction>(Opcodes2[
I])) {
18481 if (
I1->getParent() != I2->getParent())
18488 if (isa<Constant>(Opcodes1[
I]) && isa<Constant>(Opcodes2[
I]))
18490 if (Opcodes1[
I]->getValueID() != Opcodes2[
I]->getValueID())
18496 bool HaveVectorizedPhiNodes =
false;
18501 auto *
P = dyn_cast<PHINode>(&
I);
18507 if (!VisitedInstrs.
count(
P) && !
R.isDeleted(
P) &&
18520 if (!Opcodes.
empty())
18524 while (!Nodes.empty()) {
18525 auto *
PHI = cast<PHINode>(Nodes.pop_back_val());
18528 for (
Value *V :
PHI->incoming_values()) {
18529 if (
auto *PHI1 = dyn_cast<PHINode>((V))) {
18530 Nodes.push_back(PHI1);
18538 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
18539 Incoming, PHICompare, AreCompatiblePHIs,
18541 return tryToVectorizeList(Candidates, R, MaxVFOnly);
18544 Changed |= HaveVectorizedPhiNodes;
18546 }
while (HaveVectorizedPhiNodes);
18548 VisitedInstrs.
clear();
18550 InstSetVector PostProcessInserts;
18554 auto VectorizeInsertsAndCmps = [&](
bool VectorizeCmps) {
18555 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
18556 if (VectorizeCmps) {
18557 Changed |= vectorizeCmpInsts(
reverse(PostProcessCmps), BB, R);
18558 PostProcessCmps.
clear();
18560 PostProcessInserts.clear();
18565 if (
auto *Cmp = dyn_cast<CmpInst>(
I))
18566 return PostProcessCmps.
contains(Cmp);
18567 return isa<InsertElementInst, InsertValueInst>(
I) &&
18568 PostProcessInserts.contains(
I);
18574 return I->use_empty() &&
18575 (
I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(
I));
18580 if (isa<ScalableVectorType>(It->getType()))
18584 if (
R.isDeleted(&*It))
18587 if (!VisitedInstrs.
insert(&*It).second) {
18588 if (HasNoUsers(&*It) &&
18589 VectorizeInsertsAndCmps(It->isTerminator())) {
18599 if (isa<DbgInfoIntrinsic>(It))
18603 if (
PHINode *
P = dyn_cast<PHINode>(It)) {
18605 if (
P->getNumIncomingValues() == 2) {
18608 if (Root && vectorizeRootInstruction(
P, Root, BB, R,
TTI)) {
18617 for (
unsigned I = 0, E =
P->getNumIncomingValues();
I != E;
I++) {
18622 if (BB ==
P->getIncomingBlock(
I) ||
18628 if (
auto *PI = dyn_cast<Instruction>(
P->getIncomingValue(
I));
18629 PI && !IsInPostProcessInstrs(PI))
18630 Changed |= vectorizeRootInstruction(
nullptr, PI,
18631 P->getIncomingBlock(
I), R,
TTI);
18636 if (HasNoUsers(&*It)) {
18637 bool OpsChanged =
false;
18638 auto *
SI = dyn_cast<StoreInst>(It);
18648 TryToVectorizeRoot |= (
I == Stores.
end() ||
I->second.size() == 1) &&
18649 SI->getValueOperand()->hasOneUse();
18651 if (TryToVectorizeRoot) {
18652 for (
auto *V : It->operand_values()) {
18655 if (
auto *VI = dyn_cast<Instruction>(V);
18656 VI && !IsInPostProcessInstrs(VI))
18658 OpsChanged |= vectorizeRootInstruction(
nullptr, VI, BB, R,
TTI);
18665 VectorizeInsertsAndCmps(It->isTerminator());
18676 if (isa<InsertElementInst, InsertValueInst>(It))
18677 PostProcessInserts.insert(&*It);
18678 else if (isa<CmpInst>(It))
18679 PostProcessCmps.
insert(cast<CmpInst>(&*It));
18686 auto Changed =
false;
18687 for (
auto &Entry : GEPs) {
18690 if (
Entry.second.size() < 2)
18693 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing a getelementptr list of length "
18694 <<
Entry.second.size() <<
".\n");
18701 unsigned MaxVecRegSize =
R.getMaxVecRegSize();
18702 unsigned EltSize =
R.getVectorElementSize(*
Entry.second[0]->idx_begin());
18703 if (MaxVecRegSize < EltSize)
18706 unsigned MaxElts = MaxVecRegSize / EltSize;
18707 for (
unsigned BI = 0, BE =
Entry.second.size(); BI < BE; BI += MaxElts) {
18708 auto Len = std::min<unsigned>(BE - BI, MaxElts);
18721 Candidates.remove_if([&R](
Value *
I) {
18722 return R.isDeleted(cast<Instruction>(
I)) ||
18723 isa<Constant>(cast<GetElementPtrInst>(
I)->idx_begin()->
get());
18731 for (
int I = 0, E = GEPList.size();
I < E && Candidates.
size() > 1; ++
I) {
18732 auto *GEPI = GEPList[
I];
18733 if (!Candidates.count(GEPI))
18735 auto *SCEVI = SE->
getSCEV(GEPList[
I]);
18736 for (
int J =
I + 1; J < E && Candidates.
size() > 1; ++J) {
18737 auto *GEPJ = GEPList[J];
18738 auto *SCEVJ = SE->
getSCEV(GEPList[J]);
18739 if (isa<SCEVConstant>(SE->
getMinusSCEV(SCEVI, SCEVJ))) {
18740 Candidates.remove(GEPI);
18741 Candidates.remove(GEPJ);
18742 }
else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
18743 Candidates.remove(GEPJ);
18750 if (Candidates.
size() < 2)
18757 auto BundleIndex = 0
u;
18758 for (
auto *V : Candidates) {
18759 auto *
GEP = cast<GetElementPtrInst>(V);
18760 auto *GEPIdx =
GEP->idx_begin()->get();
18761 assert(
GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
18762 Bundle[BundleIndex++] = GEPIdx;
18774 Changed |= tryToVectorizeList(Bundle, R);
18780bool SLPVectorizerPass::vectorizeStoreChains(
BoUpSLP &R) {
18781 bool Changed =
false;
18786 if (
V->getValueOperand()->getType()->getTypeID() <
18787 V2->getValueOperand()->getType()->getTypeID())
18789 if (
V->getValueOperand()->getType()->getTypeID() >
18790 V2->getValueOperand()->getType()->getTypeID())
18792 if (
V->getPointerOperandType()->getTypeID() <
18793 V2->getPointerOperandType()->getTypeID())
18795 if (
V->getPointerOperandType()->getTypeID() >
18796 V2->getPointerOperandType()->getTypeID())
18799 if (isa<UndefValue>(
V->getValueOperand()) ||
18800 isa<UndefValue>(
V2->getValueOperand()))
18802 if (
auto *I1 = dyn_cast<Instruction>(
V->getValueOperand()))
18803 if (
auto *I2 = dyn_cast<Instruction>(
V2->getValueOperand())) {
18807 DT->
getNode(I2->getParent());
18808 assert(NodeI1 &&
"Should only process reachable instructions");
18809 assert(NodeI2 &&
"Should only process reachable instructions");
18810 assert((NodeI1 == NodeI2) ==
18812 "Different nodes should have different DFS numbers");
18813 if (NodeI1 != NodeI2)
18818 return I1->getOpcode() < I2->getOpcode();
18820 if (isa<Constant>(
V->getValueOperand()) &&
18821 isa<Constant>(
V2->getValueOperand()))
18823 return V->getValueOperand()->getValueID() <
18824 V2->getValueOperand()->getValueID();
18836 isa<UndefValue>(
V2->getValueOperand()))
18839 if (
auto *I2 = dyn_cast<Instruction>(
V2->getValueOperand())) {
18840 if (
I1->getParent() != I2->getParent())
18843 return S.getOpcode() > 0;
18846 isa<Constant>(
V2->getValueOperand()))
18849 V2->getValueOperand()->getValueID();
18854 for (
auto &Pair : Stores) {
18855 if (Pair.second.size() < 2)
18859 << Pair.second.size() <<
".\n");
18868 Pair.second.rend());
18869 Changed |= tryToVectorizeSequence<StoreInst>(
18870 ReversedStores, StoreSorter, AreCompatibleStores,
18872 return vectorizeStores(Candidates, R, Attempted);
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Loop::LoopBounds::Direction Direction
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static Value * isOneOf(const InstructionsState &S, Value *Op)
Chooses the correct key for scheduling data.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static std::string shortBundleName(ArrayRef< Value * > VL)
Print a short descriptor of the instruction bundle suitable for debug output.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static std::optional< unsigned > getInsertIndex(const Value *InsertInst, unsigned Offset=0)
SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
This defines the Use class.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
value_type & FindAndConstruct(const KeyT &Key)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
size_type count(const KeyT &Key) const
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
void reserve(size_type NumEntries)
Grow the MapVector so that it can contain at least NumEntries items before resizing again.
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T get() const
Returns the value of the specified pointer type.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
void clear()
Completely clear the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Value * getOperand(unsigned i) const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
\Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
unsigned getTreeSize() const
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
@ Undef
Value of the register doesn't matter.
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
testing::Matcher< const detail::ErrorHolder & > Failed()
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
auto reverse(ContainerTy &&C)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
constexpr int PoisonMaskElem
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
DWARFExpression::Operation Op
auto max_element(R &&Range)
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value and is Skew mod Align.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instructions I depend values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.