73#ifdef EXPENSIVE_CHECKS
105using namespace slpvectorizer;
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
110STATISTIC(NumVectorInstructions,
"Number of vector instructions generated");
114 cl::desc(
"Run the SLP vectorization passes"));
118 cl::desc(
"Only vectorize if you gain more than this "
123 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
124 "heuristics and makes vectorization decision via cost modeling."));
128 cl::desc(
"Attempt to vectorize horizontal reductions"));
133 "Attempt to vectorize horizontal reductions feeding into a store"));
139 cl::desc(
"Allow optimization of original scalar identity operations on "
140 "matched horizontal reductions."));
144 cl::desc(
"Attempt to vectorize for this register size in bits"));
148 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
156 cl::desc(
"Limit the size of the SLP scheduling region per block"));
160 cl::desc(
"Attempt to vectorize for this register size in bits"));
164 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
168 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
174 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
183 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
187 cl::desc(
"The minimum number of loads, which should be considered strided, "
188 "if the stride is > 1 or is runtime value"));
192 cl::desc(
"The maximum stride, considered to be profitable."));
196 cl::desc(
"Display the SLP trees with Graphviz"));
200 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
227 return VectorType::isValidElementType(Ty) && !Ty->
isX86_FP80Ty() &&
234 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
241 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
242 !isa<ExtractValueInst, UndefValue>(V))
244 auto *
I = dyn_cast<Instruction>(V);
245 if (!
I || isa<ExtractValueInst>(
I))
247 if (!isa<FixedVectorType>(
I->getOperand(0)->getType()))
249 if (isa<ExtractElementInst>(
I))
251 assert(isa<InsertElementInst>(V) &&
"Expected only insertelement.");
260 OS <<
"n=" << VL.
size() <<
" [" << *VL.
front() <<
", ..]";
276 for (
int I = 1, E = VL.
size();
I < E;
I++) {
277 auto *II = dyn_cast<Instruction>(VL[
I]);
298 Value *FirstNonUndef =
nullptr;
299 for (
Value *V : VL) {
300 if (isa<UndefValue>(V))
302 if (!FirstNonUndef) {
306 if (V != FirstNonUndef)
309 return FirstNonUndef !=
nullptr;
314 if (
auto *Cmp = dyn_cast<CmpInst>(
I))
315 return Cmp->isCommutative();
316 if (
auto *BO = dyn_cast<BinaryOperator>(
I))
317 return BO->isCommutative() ||
318 (BO->getOpcode() == Instruction::Sub &&
324 ICmpInst::Predicate Pred;
325 if (match(U.getUser(),
326 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
327 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
331 return match(U.getUser(),
332 m_Intrinsic<Intrinsic::abs>(
333 m_Specific(U.get()), m_ConstantInt(Flag))) &&
334 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
337 (BO->getOpcode() == Instruction::FSub &&
340 return match(U.getUser(),
341 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
343 return I->isCommutative();
351 if (
const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
352 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
355 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
358 if (CI->getValue().uge(VT->getNumElements()))
360 Index *= VT->getNumElements();
361 Index += CI->getZExtValue();
365 const auto *
IV = cast<InsertValueInst>(InsertInst);
366 Type *CurrentType =
IV->getType();
367 for (
unsigned I :
IV->indices()) {
368 if (
const auto *ST = dyn_cast<StructType>(CurrentType)) {
369 Index *= ST->getNumElements();
370 CurrentType = ST->getElementType(
I);
371 }
else if (
const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
372 Index *= AT->getNumElements();
373 CurrentType = AT->getElementType();
406 if (MaskArg == UseMask::UndefsAsMask)
410 if (MaskArg == UseMask::FirstArg &&
Value < VF)
411 UseMask.reset(
Value);
412 else if (MaskArg == UseMask::SecondArg &&
Value >= VF)
413 UseMask.reset(
Value - VF);
421template <
bool IsPoisonOnly = false>
425 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
428 auto *VecTy = dyn_cast<FixedVectorType>(
V->getType());
431 auto *
C = dyn_cast<Constant>(V);
433 if (!UseMask.empty()) {
435 while (
auto *II = dyn_cast<InsertElementInst>(
Base)) {
436 Base = II->getOperand(0);
437 if (isa<T>(II->getOperand(1)))
444 if (*
Idx < UseMask.size() && !UseMask.test(*
Idx))
452 Res &= isUndefVector<IsPoisonOnly>(
Base, SubMask);
459 for (
unsigned I = 0, E = VecTy->getNumElements();
I != E; ++
I) {
460 if (
Constant *Elem =
C->getAggregateElement(
I))
462 (UseMask.empty() || (
I < UseMask.size() && !UseMask.test(
I))))
490static std::optional<TargetTransformInfo::ShuffleKind>
492 const auto *It =
find_if(VL, IsaPred<ExtractElementInst>);
495 auto *EI0 = cast<ExtractElementInst>(*It);
496 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
499 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
500 Value *Vec1 =
nullptr;
501 Value *Vec2 =
nullptr;
503 ShuffleMode CommonShuffleMode =
Unknown;
505 for (
unsigned I = 0, E = VL.
size();
I < E; ++
I) {
507 if (isa<UndefValue>(VL[
I]))
509 auto *EI = cast<ExtractElementInst>(VL[
I]);
510 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
512 auto *Vec = EI->getVectorOperand();
517 if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
Size)
519 if (isa<UndefValue>(EI->getIndexOperand()))
521 auto *
Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
527 unsigned IntIdx =
Idx->getValue().getZExtValue();
531 if (!Vec1 || Vec1 == Vec) {
533 }
else if (!Vec2 || Vec2 == Vec) {
539 if (CommonShuffleMode == Permute)
544 CommonShuffleMode = Permute;
547 CommonShuffleMode =
Select;
550 if (CommonShuffleMode ==
Select && Vec2)
561 assert((Opcode == Instruction::ExtractElement ||
562 Opcode == Instruction::ExtractValue) &&
563 "Expected extractelement or extractvalue instruction.");
564 if (Opcode == Instruction::ExtractElement) {
565 auto *CI = dyn_cast<ConstantInt>(E->
getOperand(1));
568 return CI->getZExtValue();
570 auto *EI = cast<ExtractValueInst>(E);
571 if (EI->getNumIndices() != 1)
573 return *EI->idx_begin();
579struct InstructionsState {
581 Value *OpValue =
nullptr;
592 unsigned getAltOpcode()
const {
597 bool isAltShuffle()
const {
return AltOp != MainOp; }
600 unsigned CheckedOpcode =
I->getOpcode();
601 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
604 InstructionsState() =
delete;
606 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
615 auto *
I = dyn_cast<Instruction>(
Op);
616 if (
I && S.isOpcodeOrAlt(
I))
635 unsigned BaseIndex = 0);
643 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
644 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
645 BaseOp0 == Op0 || BaseOp1 == Op1 ||
656 "Assessing comparisons of different types?");
666 return (BasePred == Pred &&
668 (BasePred == SwappedPred &&
677 unsigned BaseIndex) {
680 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
682 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
683 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
684 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
686 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
688 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
689 unsigned AltOpcode = Opcode;
690 unsigned AltIndex = BaseIndex;
692 bool SwappedPredsCompatible = [&]() {
696 UniquePreds.
insert(BasePred);
697 UniqueNonSwappedPreds.
insert(BasePred);
698 for (
Value *V : VL) {
699 auto *
I = dyn_cast<CmpInst>(V);
705 UniqueNonSwappedPreds.
insert(CurrentPred);
706 if (!UniquePreds.
contains(CurrentPred) &&
707 !UniquePreds.
contains(SwappedCurrentPred))
708 UniquePreds.
insert(CurrentPred);
713 return UniqueNonSwappedPreds.
size() > 2 && UniquePreds.
size() == 2;
717 auto *IBase = cast<Instruction>(VL[BaseIndex]);
720 if (
auto *
CallBase = dyn_cast<CallInst>(IBase)) {
724 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
726 for (
int Cnt = 0, E = VL.
size(); Cnt < E; Cnt++) {
727 auto *
I = cast<Instruction>(VL[Cnt]);
728 unsigned InstOpcode =
I->getOpcode();
729 if (IsBinOp && isa<BinaryOperator>(
I)) {
730 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
734 AltOpcode = InstOpcode;
738 }
else if (IsCastOp && isa<CastInst>(
I)) {
739 Value *Op0 = IBase->getOperand(0);
741 Value *Op1 =
I->getOperand(0);
744 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
746 if (Opcode == AltOpcode) {
749 "Cast isn't safe for alternation, logic needs to be updated!");
750 AltOpcode = InstOpcode;
755 }
else if (
auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
756 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
757 Type *Ty0 = BaseInst->getOperand(0)->getType();
758 Type *Ty1 = Inst->getOperand(0)->getType();
760 assert(InstOpcode == Opcode &&
"Expected same CmpInst opcode.");
767 if ((E == 2 || SwappedPredsCompatible) &&
768 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
773 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
774 if (AltIndex != BaseIndex) {
777 }
else if (BasePred != CurrentPred) {
780 "CmpInst isn't safe for alternation, logic needs to be updated!");
785 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
786 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
789 }
else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
790 if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
791 if (Gep->getNumOperands() != 2 ||
792 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
793 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
794 }
else if (
auto *EI = dyn_cast<ExtractElementInst>(
I)) {
796 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
797 }
else if (
auto *LI = dyn_cast<LoadInst>(
I)) {
798 auto *BaseLI = cast<LoadInst>(IBase);
799 if (!LI->isSimple() || !BaseLI->isSimple())
800 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
801 }
else if (
auto *Call = dyn_cast<CallInst>(
I)) {
802 auto *
CallBase = cast<CallInst>(IBase);
804 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
805 if (Call->hasOperandBundles() &&
806 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
807 Call->op_begin() + Call->getBundleOperandsEndIndex(),
810 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
813 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
816 if (Mappings.
size() != BaseMappings.
size() ||
817 Mappings.
front().ISA != BaseMappings.
front().ISA ||
818 Mappings.
front().ScalarName != BaseMappings.
front().ScalarName ||
819 Mappings.
front().VectorName != BaseMappings.
front().VectorName ||
820 Mappings.
front().Shape.VF != BaseMappings.
front().Shape.VF ||
821 Mappings.
front().Shape.Parameters !=
822 BaseMappings.
front().Shape.Parameters)
823 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
828 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
831 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
832 cast<Instruction>(VL[AltIndex]));
848 case Instruction::Load: {
849 LoadInst *LI = cast<LoadInst>(UserInst);
852 case Instruction::Store: {
853 StoreInst *SI = cast<StoreInst>(UserInst);
854 return (SI->getPointerOperand() == Scalar);
856 case Instruction::Call: {
857 CallInst *CI = cast<CallInst>(UserInst);
860 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
861 Arg.value().get() == Scalar;
873 if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
880 if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
881 return LI->isSimple();
883 return SI->isSimple();
885 return !
MI->isVolatile();
893 bool ExtendingManyInputs =
false) {
897 (!ExtendingManyInputs || SubMask.
size() > Mask.size() ||
899 (SubMask.
size() == Mask.size() &&
900 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
901 [](
int Idx) { return Idx == PoisonMaskElem; }))) &&
902 "SubMask with many inputs support must be larger than the mask.");
904 Mask.append(SubMask.
begin(), SubMask.
end());
908 int TermValue = std::min(Mask.size(), SubMask.
size());
909 for (
int I = 0, E = SubMask.
size();
I < E; ++
I) {
911 (!ExtendingManyInputs &&
912 (SubMask[
I] >= TermValue || Mask[SubMask[
I]] >= TermValue)))
914 NewMask[
I] = Mask[SubMask[
I]];
930 const unsigned Sz = Order.
size();
933 for (
unsigned I = 0;
I < Sz; ++
I) {
935 UnusedIndices.
reset(Order[
I]);
937 MaskedIndices.
set(
I);
939 if (MaskedIndices.
none())
942 "Non-synced masked/available indices.");
946 assert(
Idx >= 0 &&
"Indices must be synced.");
958 const unsigned E = Indices.
size();
960 for (
unsigned I = 0;
I < E; ++
I)
961 Mask[Indices[
I]] =
I;
967 assert(!Mask.empty() &&
"Expected non-empty mask.");
971 for (
unsigned I = 0, E = Prev.
size();
I < E; ++
I)
973 Scalars[Mask[
I]] = Prev[
I];
981 auto *
I = dyn_cast<Instruction>(V);
986 auto *IO = dyn_cast<Instruction>(V);
989 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
998 auto *
I = dyn_cast<Instruction>(V);
1002 return !
I->mayReadOrWriteMemory() && !
I->hasNUsesOrMore(
UsesLimit) &&
1004 auto *IU = dyn_cast<Instruction>(U);
1007 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1023 return !VL.
empty() &&
1027namespace slpvectorizer {
1032 struct ScheduleData;
1057 : BatchAA(*Aa),
F(Func), SE(Se),
TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1058 AC(AC), DB(DB),
DL(
DL), ORE(ORE),
1112 return !VectorizableTree.
empty() &&
1113 !VectorizableTree.
front()->UserTreeIndices.empty();
1118 assert(!VectorizableTree.
empty() &&
"No graph to get the first node from");
1119 return VectorizableTree.
front()->Scalars;
1134 VectorizableTree.
clear();
1135 ScalarToTreeEntry.clear();
1136 MultiNodeScalars.clear();
1138 NonScheduledFirst.
clear();
1139 EntryToLastInstruction.clear();
1140 ExternalUses.
clear();
1141 ExternalUsesAsGEPs.clear();
1142 for (
auto &Iter : BlocksSchedules) {
1143 BlockScheduling *BS = Iter.second.get();
1147 ReductionBitWidth = 0;
1148 CastMaxMinBWSizes.reset();
1149 ExtraBitWidthNodes.
clear();
1150 InstrElementSize.clear();
1151 UserIgnoreList =
nullptr;
1152 PostponedGathers.
clear();
1153 ValueToGatherNodes.
clear();
1210 return MaxVecRegSize;
1215 return MinVecRegSize;
1223 unsigned MaxVF =
MaxVFOption.getNumOccurrences() ?
1225 return MaxVF ? MaxVF : UINT_MAX;
1269 bool TryRecursiveCheck =
true)
const;
1293 OS <<
"{User:" << (
UserTE ? std::to_string(
UserTE->Idx) :
"null")
1294 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
1316 : TLI(TLI),
DL(
DL), SE(SE), R(R), NumLanes(NumLanes),
1317 MaxLevel(MaxLevel) {}
1371 if (isa<LoadInst>(V1)) {
1373 auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
1378 auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
1380 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1383 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1386 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1388 ((
int)V1->getNumUses() == NumLanes ||
1389 AllUsersAreInternal(V1, V2)))
1395 auto *LI1 = dyn_cast<LoadInst>(V1);
1396 auto *LI2 = dyn_cast<LoadInst>(V2);
1398 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1403 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1404 LI2->getPointerOperand(),
DL, SE,
true);
1405 if (!Dist || *Dist == 0) {
1408 R.TTI->isLegalMaskedGather(
1416 if (std::abs(*Dist) > NumLanes / 2)
1425 auto *C1 = dyn_cast<Constant>(V1);
1426 auto *C2 = dyn_cast<Constant>(V2);
1440 if (isa<UndefValue>(V2))
1444 Value *EV2 =
nullptr;
1457 int Dist = Idx2 - Idx1;
1460 if (std::abs(Dist) == 0)
1462 if (std::abs(Dist) > NumLanes / 2)
1472 auto *I1 = dyn_cast<Instruction>(V1);
1473 auto *I2 = dyn_cast<Instruction>(V2);
1475 if (I1->getParent() != I2->getParent())
1483 if (S.getOpcode() &&
1484 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.
empty() ||
1485 !S.isAltShuffle()) &&
1487 return cast<Instruction>(V)->getNumOperands() ==
1488 S.MainOp->getNumOperands();
1494 if (isa<UndefValue>(V2))
1531 int ShallowScoreAtThisLevel =
1540 auto *I1 = dyn_cast<Instruction>(
LHS);
1541 auto *I2 = dyn_cast<Instruction>(
RHS);
1542 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1544 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1545 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1546 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1547 ShallowScoreAtThisLevel))
1548 return ShallowScoreAtThisLevel;
1549 assert(I1 && I2 &&
"Should have early exited.");
1556 for (
unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1557 OpIdx1 != NumOperands1; ++OpIdx1) {
1559 int MaxTmpScore = 0;
1560 unsigned MaxOpIdx2 = 0;
1561 bool FoundBest =
false;
1565 ? I2->getNumOperands()
1566 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1567 assert(FromIdx <= ToIdx &&
"Bad index");
1568 for (
unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1570 if (Op2Used.
count(OpIdx2))
1575 I1, I2, CurrLevel + 1, std::nullopt);
1578 TmpScore > MaxTmpScore) {
1579 MaxTmpScore = TmpScore;
1586 Op2Used.
insert(MaxOpIdx2);
1587 ShallowScoreAtThisLevel += MaxTmpScore;
1590 return ShallowScoreAtThisLevel;
1621 struct OperandData {
1622 OperandData() =
default;
1623 OperandData(
Value *V,
bool APO,
bool IsUsed)
1624 : V(V), APO(APO), IsUsed(IsUsed) {}
1634 bool IsUsed =
false;
1643 enum class ReorderingMode {
1662 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
1663 return OpsVec[OpIdx][Lane];
1667 const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const {
1668 return OpsVec[OpIdx][Lane];
1673 for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
1674 OpIdx != NumOperands; ++OpIdx)
1675 for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1677 OpsVec[OpIdx][Lane].IsUsed =
false;
1681 void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
1682 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1694 int getSplatScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
1695 Value *IdxLaneV = getData(
Idx, Lane).V;
1696 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1699 for (
unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1702 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1703 if (!isa<Instruction>(OpIdxLnV))
1705 Uniques.
insert(OpIdxLnV);
1707 int UniquesCount = Uniques.
size();
1708 int UniquesCntWithIdxLaneV =
1709 Uniques.
contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1710 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1711 int UniquesCntWithOpIdxLaneV =
1712 Uniques.
contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1713 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1716 UniquesCntWithOpIdxLaneV) -
1717 (
PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1726 int getExternalUseScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
1727 Value *IdxLaneV = getData(
Idx, Lane).V;
1728 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1737 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1738 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1740 return R.areAllUsersVectorized(IdxLaneI)
1748 static const int ScoreScaleFactor = 10;
1756 int Lane,
unsigned OpIdx,
unsigned Idx,
1766 int SplatScore = getSplatScore(Lane, OpIdx,
Idx);
1767 if (Score <= -SplatScore) {
1772 Score += SplatScore;
1778 Score *= ScoreScaleFactor;
1779 Score += getExternalUseScore(Lane, OpIdx,
Idx);
1797 std::optional<unsigned>
1798 getBestOperand(
unsigned OpIdx,
int Lane,
int LastLane,
1801 unsigned NumOperands = getNumOperands();
1804 Value *OpLastLane = getData(OpIdx, LastLane).V;
1807 ReorderingMode RMode = ReorderingModes[OpIdx];
1808 if (RMode == ReorderingMode::Failed)
1809 return std::nullopt;
1812 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1818 std::optional<unsigned>
Idx;
1822 BestScoresPerLanes.
try_emplace(std::make_pair(OpIdx, Lane), 0)
1829 RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
1831 for (
unsigned Idx = 0;
Idx != NumOperands; ++
Idx) {
1833 OperandData &OpData = getData(
Idx, Lane);
1835 bool OpAPO = OpData.APO;
1844 if (OpAPO != OpIdxAPO)
1849 case ReorderingMode::Load:
1850 case ReorderingMode::Constant:
1851 case ReorderingMode::Opcode: {
1852 bool LeftToRight = Lane > LastLane;
1853 Value *OpLeft = (LeftToRight) ? OpLastLane :
Op;
1854 Value *OpRight = (LeftToRight) ?
Op : OpLastLane;
1855 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1856 OpIdx,
Idx, IsUsed);
1857 if (Score >
static_cast<int>(BestOp.Score)) {
1859 BestOp.Score = Score;
1860 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1864 case ReorderingMode::Splat:
1865 if (
Op == OpLastLane)
1868 case ReorderingMode::Failed:
1874 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1878 return std::nullopt;
1885 unsigned getBestLaneToStartReordering()
const {
1886 unsigned Min = UINT_MAX;
1887 unsigned SameOpNumber = 0;
1898 for (
int I = getNumLanes();
I > 0; --
I) {
1899 unsigned Lane =
I - 1;
1900 OperandsOrderData NumFreeOpsHash =
1901 getMaxNumOperandsThatCanBeReordered(Lane);
1904 if (NumFreeOpsHash.NumOfAPOs < Min) {
1905 Min = NumFreeOpsHash.NumOfAPOs;
1906 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1908 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1909 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
1910 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1913 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1914 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1915 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
1916 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1917 auto *It = HashMap.
find(NumFreeOpsHash.Hash);
1918 if (It == HashMap.
end())
1919 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1925 unsigned BestLane = 0;
1926 unsigned CntMin = UINT_MAX;
1928 if (
Data.second.first < CntMin) {
1929 CntMin =
Data.second.first;
1930 BestLane =
Data.second.second;
1937 struct OperandsOrderData {
1940 unsigned NumOfAPOs = UINT_MAX;
1943 unsigned NumOpsWithSameOpcodeParent = 0;
1957 OperandsOrderData getMaxNumOperandsThatCanBeReordered(
unsigned Lane)
const {
1958 unsigned CntTrue = 0;
1959 unsigned NumOperands = getNumOperands();
1969 bool AllUndefs =
true;
1970 unsigned NumOpsWithSameOpcodeParent = 0;
1974 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1975 const OperandData &OpData = getData(OpIdx, Lane);
1980 if (
auto *
I = dyn_cast<Instruction>(OpData.V)) {
1982 I->getParent() != Parent) {
1983 if (NumOpsWithSameOpcodeParent == 0) {
1984 NumOpsWithSameOpcodeParent = 1;
1986 Parent =
I->getParent();
1988 --NumOpsWithSameOpcodeParent;
1991 ++NumOpsWithSameOpcodeParent;
1995 Hash,
hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
1996 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2000 OperandsOrderData
Data;
2001 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2002 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2010 assert((empty() || VL.
size() == getNumLanes()) &&
2011 "Expected same number of lanes");
2012 assert(isa<Instruction>(VL[0]) &&
"Expected instruction");
2013 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2014 constexpr unsigned IntrinsicNumOperands = 2;
2015 if (isa<IntrinsicInst>(VL[0]))
2016 NumOperands = IntrinsicNumOperands;
2017 OpsVec.
resize(NumOperands);
2018 unsigned NumLanes = VL.
size();
2019 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2020 OpsVec[OpIdx].
resize(NumLanes);
2021 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2022 assert(isa<Instruction>(VL[Lane]) &&
"Expected instruction");
2033 bool IsInverseOperation = !
isCommutative(cast<Instruction>(VL[Lane]));
2034 bool APO = (OpIdx == 0) ?
false : IsInverseOperation;
2035 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2042 unsigned getNumOperands()
const {
return OpsVec.
size(); }
2045 unsigned getNumLanes()
const {
return OpsVec[0].
size(); }
2048 Value *getValue(
unsigned OpIdx,
unsigned Lane)
const {
2049 return getData(OpIdx, Lane).V;
2053 bool empty()
const {
return OpsVec.
empty(); }
2056 void clear() { OpsVec.
clear(); }
2061 bool shouldBroadcast(
Value *
Op,
unsigned OpIdx,
unsigned Lane) {
2062 bool OpAPO = getData(OpIdx, Lane).APO;
2063 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2067 bool FoundCandidate =
false;
2068 for (
unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2069 OperandData &
Data = getData(OpI, Ln);
2070 if (
Data.APO != OpAPO ||
Data.IsUsed)
2073 FoundCandidate =
true;
2078 if (!FoundCandidate)
2087 : TLI(*R.TLI),
DL(*R.
DL), SE(*R.SE), R(R) {
2089 appendOperandsOfVL(RootVL);
2096 assert(OpsVec[OpIdx].
size() == getNumLanes() &&
2097 "Expected same num of lanes across all operands");
2098 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2099 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2107 unsigned NumOperands = getNumOperands();
2108 unsigned NumLanes = getNumLanes();
2128 unsigned FirstLane = getBestLaneToStartReordering();
2131 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2132 Value *OpLane0 = getValue(OpIdx, FirstLane);
2135 if (isa<LoadInst>(OpLane0))
2136 ReorderingModes[OpIdx] = ReorderingMode::Load;
2137 else if (isa<Instruction>(OpLane0)) {
2139 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2140 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2142 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2144 else if (isa<Constant>(OpLane0))
2145 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2146 else if (isa<Argument>(OpLane0))
2148 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2151 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2158 auto &&SkipReordering = [
this]() {
2161 for (
const OperandData &
Data : Op0)
2164 if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
2183 if (SkipReordering())
2186 bool StrategyFailed =
false;
2194 for (
unsigned I = 0;
I < NumOperands; ++
I)
2195 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
2197 for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2200 int Lane = FirstLane +
Direction * Distance;
2201 if (Lane < 0 || Lane >= (
int)NumLanes)
2204 assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
2207 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2209 std::optional<unsigned> BestIdx = getBestOperand(
2210 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2217 swap(OpIdx, *BestIdx, Lane);
2220 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2222 StrategyFailed =
true;
2225 if (MainAltOps[OpIdx].
size() != 2) {
2226 OperandData &AltOp = getData(OpIdx, Lane);
2227 InstructionsState OpS =
2229 if (OpS.getOpcode() && OpS.isAltShuffle())
2236 if (!StrategyFailed)
2241#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2244 case ReorderingMode::Load:
2246 case ReorderingMode::Opcode:
2248 case ReorderingMode::Constant:
2250 case ReorderingMode::Splat:
2252 case ReorderingMode::Failed:
2273 const unsigned Indent = 2;
2276 OS <<
"Operand " << Cnt++ <<
"\n";
2277 for (
const OperandData &OpData : OpDataVec) {
2279 if (
Value *V = OpData.V)
2283 OS <<
", APO:" << OpData.APO <<
"}\n";
2305 int BestScore = Limit;
2306 std::optional<int>
Index;
2307 for (
int I : seq<int>(0, Candidates.size())) {
2309 Candidates[
I].second,
2312 if (Score > BestScore) {
2327 DeletedInstructions.insert(
I);
2333 return AnalyzedReductionsRoots.count(
I);
2338 AnalyzedReductionsRoots.insert(
I);
2352 AnalyzedReductionsRoots.clear();
2353 AnalyzedReductionVals.
clear();
2354 AnalyzedMinBWVals.
clear();
2366 return NonScheduledFirst.
contains(V);
2379 bool collectValuesToDemote(
const TreeEntry &E,
bool IsProfitableToDemoteRoot,
2383 unsigned &MaxDepthLevel,
2384 bool &IsProfitableToDemote,
2385 bool IsTruncRoot)
const;
2395 canReorderOperands(TreeEntry *UserTE,
2402 void reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const;
2406 TreeEntry *getVectorizedOperand(TreeEntry *UserTE,
unsigned OpIdx) {
2408 TreeEntry *TE =
nullptr;
2410 TE = getTreeEntry(V);
2411 if (TE &&
is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2413 auto It = MultiNodeScalars.find(V);
2414 if (It != MultiNodeScalars.end()) {
2415 for (TreeEntry *E : It->second) {
2416 if (
is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2424 if (It != VL.
end()) {
2425 assert(
TE->isSame(VL) &&
"Expected same scalars.");
2433 const TreeEntry *getVectorizedOperand(
const TreeEntry *UserTE,
2434 unsigned OpIdx)
const {
2435 return const_cast<BoUpSLP *
>(
this)->getVectorizedOperand(
2436 const_cast<TreeEntry *
>(UserTE), OpIdx);
2440 bool areAllUsersVectorized(
2449 const TreeEntry *getOperandEntry(
const TreeEntry *E,
unsigned Idx)
const;
2453 getCastContextHint(
const TreeEntry &TE)
const;
2462 const EdgeInfo &EI);
2473 bool ResizeAllowed =
false)
const;
2484 Value *vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
bool PostponedPHIs);
2489 template <
typename BVTy,
typename ResTy,
typename...
Args>
2490 ResTy processBuildVector(
const TreeEntry *E,
Type *ScalarTy, Args &...Params);
2495 Value *createBuildVector(
const TreeEntry *E,
Type *ScalarTy);
2501 Instruction &getLastInstructionInBundle(
const TreeEntry *E);
2508 std::optional<TargetTransformInfo::ShuffleKind>
2520 unsigned NumParts)
const;
2532 std::optional<TargetTransformInfo::ShuffleKind>
2533 isGatherShuffledSingleRegisterEntry(
2550 isGatherShuffledEntry(
2553 unsigned NumParts,
bool ForOrder =
false);
2560 Type *ScalarTy)
const;
2564 void setInsertPointAfterBundle(
const TreeEntry *E);
2572 bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
2585 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
2601 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
2605 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2622 [Scalars](
Value *V,
int Idx) {
2623 return (isa<UndefValue>(V) &&
2624 Idx == PoisonMaskElem) ||
2625 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2628 if (!ReorderIndices.empty()) {
2635 return IsSame(Scalars, Mask);
2636 if (VL.
size() == ReuseShuffleIndices.size()) {
2638 return IsSame(Scalars, Mask);
2642 return IsSame(Scalars, ReuseShuffleIndices);
2645 bool isOperandGatherNode(
const EdgeInfo &UserEI)
const {
2646 return State == TreeEntry::NeedToGather &&
2647 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2648 UserTreeIndices.front().UserTE == UserEI.UserTE;
2652 bool hasEqualOperands(
const TreeEntry &TE)
const {
2653 if (
TE.getNumOperands() != getNumOperands())
2656 for (
unsigned I = 0, E = getNumOperands();
I < E; ++
I) {
2657 unsigned PrevCount =
Used.count();
2658 for (
unsigned K = 0;
K < E; ++
K) {
2661 if (getOperand(K) ==
TE.getOperand(
I)) {
2667 if (PrevCount ==
Used.count())
2676 unsigned getVectorFactor()
const {
2677 if (!ReuseShuffleIndices.empty())
2678 return ReuseShuffleIndices.size();
2679 return Scalars.
size();
2714 VecTreeTy &Container;
2738 assert(Operands[OpIdx].empty() &&
"Already resized?");
2740 "Number of operands is greater than the number of scalars.");
2746 void setOperandsInOrder() {
2748 auto *I0 = cast<Instruction>(Scalars[0]);
2749 Operands.resize(I0->getNumOperands());
2750 unsigned NumLanes = Scalars.size();
2751 for (
unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2752 OpIdx != NumOperands; ++OpIdx) {
2754 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2755 auto *
I = cast<Instruction>(Scalars[Lane]);
2756 assert(
I->getNumOperands() == NumOperands &&
2757 "Expected same number of operands");
2758 Operands[OpIdx][Lane] =
I->getOperand(OpIdx);
2782 unsigned getNumOperands()
const {
return Operands.size(); }
2785 Value *getSingleOperand(
unsigned OpIdx)
const {
2787 assert(!Operands[OpIdx].empty() &&
"No operand available");
2792 bool isAltShuffle()
const {
return MainOp != AltOp; }
2795 unsigned CheckedOpcode =
I->getOpcode();
2796 return (getOpcode() == CheckedOpcode ||
2797 getAltOpcode() == CheckedOpcode);
2804 auto *
I = dyn_cast<Instruction>(
Op);
2805 if (
I && isOpcodeOrAlt(
I))
2810 void setOperations(
const InstructionsState &S) {
2824 unsigned getOpcode()
const {
2825 return MainOp ? MainOp->
getOpcode() : 0;
2828 unsigned getAltOpcode()
const {
2834 int findLaneForValue(
Value *V)
const {
2835 unsigned FoundLane = std::distance(Scalars.begin(),
find(Scalars, V));
2836 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
2837 if (!ReorderIndices.
empty())
2838 FoundLane = ReorderIndices[FoundLane];
2839 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
2840 if (!ReuseShuffleIndices.
empty()) {
2841 FoundLane = std::distance(ReuseShuffleIndices.
begin(),
2842 find(ReuseShuffleIndices, FoundLane));
2856 bool isNonPowOf2Vec()
const {
2858 assert((!IsNonPowerOf2 || ReuseShuffleIndices.
empty()) &&
2859 "Reshuffling not supported with non-power-of-2 vectors yet.");
2860 return IsNonPowerOf2;
2867 for (
unsigned OpI = 0, OpE =
Operands.size(); OpI != OpE; ++OpI) {
2868 dbgs() <<
"Operand " << OpI <<
":\n";
2869 for (
const Value *V : Operands[OpI])
2872 dbgs() <<
"Scalars: \n";
2873 for (
Value *V : Scalars)
2875 dbgs() <<
"State: ";
2878 dbgs() <<
"Vectorize\n";
2880 case ScatterVectorize:
2881 dbgs() <<
"ScatterVectorize\n";
2883 case StridedVectorize:
2884 dbgs() <<
"StridedVectorize\n";
2887 dbgs() <<
"NeedToGather\n";
2890 dbgs() <<
"MainOp: ";
2892 dbgs() << *MainOp <<
"\n";
2895 dbgs() <<
"AltOp: ";
2897 dbgs() << *AltOp <<
"\n";
2900 dbgs() <<
"VectorizedValue: ";
2901 if (VectorizedValue)
2902 dbgs() << *VectorizedValue <<
"\n";
2905 dbgs() <<
"ReuseShuffleIndices: ";
2906 if (ReuseShuffleIndices.
empty())
2909 for (
int ReuseIdx : ReuseShuffleIndices)
2910 dbgs() << ReuseIdx <<
", ";
2912 dbgs() <<
"ReorderIndices: ";
2913 for (
unsigned ReorderIdx : ReorderIndices)
2914 dbgs() << ReorderIdx <<
", ";
2916 dbgs() <<
"UserTreeIndices: ";
2917 for (
const auto &EInfo : UserTreeIndices)
2918 dbgs() << EInfo <<
", ";
2925 void dumpTreeCosts(
const TreeEntry *E,
InstructionCost ReuseShuffleCost,
2928 dbgs() <<
"SLP: " << Banner <<
":\n";
2930 dbgs() <<
"SLP: Costs:\n";
2931 dbgs() <<
"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<
"\n";
2932 dbgs() <<
"SLP: VectorCost = " << VecCost <<
"\n";
2933 dbgs() <<
"SLP: ScalarCost = " << ScalarCost <<
"\n";
2934 dbgs() <<
"SLP: ReuseShuffleCost + VecCost - ScalarCost = "
2935 << ReuseShuffleCost + VecCost - ScalarCost <<
"\n";
2941 std::optional<ScheduleData *> Bundle,
2942 const InstructionsState &S,
2943 const EdgeInfo &UserTreeIdx,
2946 TreeEntry::EntryState EntryState =
2947 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2948 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
2949 ReuseShuffleIndices, ReorderIndices);
2953 TreeEntry::EntryState EntryState,
2954 std::optional<ScheduleData *> Bundle,
2955 const InstructionsState &S,
2956 const EdgeInfo &UserTreeIdx,
2959 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2960 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2961 "Need to vectorize gather entry?");
2962 VectorizableTree.
push_back(std::make_unique<TreeEntry>(VectorizableTree));
2963 TreeEntry *
Last = VectorizableTree.
back().get();
2964 Last->Idx = VectorizableTree.
size() - 1;
2965 Last->State = EntryState;
2966 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
2967 ReuseShuffleIndices.end());
2968 if (ReorderIndices.
empty()) {
2970 Last->setOperations(S);
2973 Last->Scalars.assign(VL.
size(),
nullptr);
2976 if (Idx >= VL.size())
2977 return UndefValue::get(VL.front()->getType());
2981 Last->setOperations(S);
2982 Last->ReorderIndices.append(ReorderIndices.
begin(), ReorderIndices.
end());
2984 if (
Last->State != TreeEntry::NeedToGather) {
2985 for (
Value *V : VL) {
2986 const TreeEntry *
TE = getTreeEntry(V);
2988 "Scalar already in tree!");
2991 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(
Last);
2994 ScalarToTreeEntry[
V] =
Last;
2997 ScheduleData *BundleMember = *Bundle;
2998 assert((BundleMember || isa<PHINode>(S.MainOp) ||
3001 "Bundle and VL out of sync");
3003 for (
Value *V : VL) {
3008 BundleMember->TE =
Last;
3009 BundleMember = BundleMember->NextInBundle;
3012 assert(!BundleMember &&
"Bundle and VL out of sync");
3015 bool AllConstsOrCasts =
true;
3018 auto *
I = dyn_cast<CastInst>(V);
3019 AllConstsOrCasts &=
I &&
I->getType()->isIntegerTy();
3022 if (AllConstsOrCasts)
3024 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3025 MustGather.
insert(VL.begin(), VL.end());
3028 if (UserTreeIdx.UserTE) {
3029 Last->UserTreeIndices.push_back(UserTreeIdx);
3030 assert((!
Last->isNonPowOf2Vec() ||
Last->ReorderIndices.empty()) &&
3031 "Reordering isn't implemented for non-power-of-2 nodes yet");
3038 TreeEntry::VecTreeTy VectorizableTree;
3043 for (
unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3044 VectorizableTree[
Id]->dump();
3050 TreeEntry *getTreeEntry(
Value *V) {
return ScalarToTreeEntry.lookup(V); }
3052 const TreeEntry *getTreeEntry(
Value *V)
const {
3053 return ScalarToTreeEntry.lookup(V);
3062 bool areAltOperandsProfitable(
const InstructionsState &S,
3067 TreeEntry::EntryState getScalarsVectorizationState(
3100 using ValueToGatherNodesMap =
3102 ValueToGatherNodesMap ValueToGatherNodes;
3105 struct ExternalUser {
3129 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
3130 auto It = AliasCache.
find(Key);
3131 if (It != AliasCache.
end())
3136 AliasCache.
try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3140 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3172 UserList ExternalUses;
3192 struct ScheduleData {
3195 enum { InvalidDeps = -1 };
3197 ScheduleData() =
default;
3199 void init(
int BlockSchedulingRegionID,
Value *OpVal) {
3200 FirstInBundle =
this;
3201 NextInBundle =
nullptr;
3202 NextLoadStore =
nullptr;
3203 IsScheduled =
false;
3204 SchedulingRegionID = BlockSchedulingRegionID;
3205 clearDependencies();
3212 if (hasValidDependencies()) {
3213 assert(UnscheduledDeps <= Dependencies &&
"invariant");
3215 assert(UnscheduledDeps == Dependencies &&
"invariant");
3219 assert(isSchedulingEntity() &&
3220 "unexpected scheduled state");
3221 for (
const ScheduleData *BundleMember =
this; BundleMember;
3222 BundleMember = BundleMember->NextInBundle) {
3223 assert(BundleMember->hasValidDependencies() &&
3224 BundleMember->UnscheduledDeps == 0 &&
3225 "unexpected scheduled state");
3226 assert((BundleMember ==
this || !BundleMember->IsScheduled) &&
3227 "only bundle is marked scheduled");
3231 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3232 "all bundle members must be in same basic block");
3238 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
3242 bool isSchedulingEntity()
const {
return FirstInBundle ==
this; }
3246 bool isPartOfBundle()
const {
3247 return NextInBundle !=
nullptr || FirstInBundle !=
this ||
TE;
3252 bool isReady()
const {
3253 assert(isSchedulingEntity() &&
3254 "can't consider non-scheduling entity for ready list");
3255 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3261 int incrementUnscheduledDeps(
int Incr) {
3262 assert(hasValidDependencies() &&
3263 "increment of unscheduled deps would be meaningless");
3264 UnscheduledDeps += Incr;
3265 return FirstInBundle->unscheduledDepsInBundle();
3270 void resetUnscheduledDeps() {
3271 UnscheduledDeps = Dependencies;
3275 void clearDependencies() {
3276 Dependencies = InvalidDeps;
3277 resetUnscheduledDeps();
3278 MemoryDependencies.clear();
3279 ControlDependencies.clear();
3282 int unscheduledDepsInBundle()
const {
3283 assert(isSchedulingEntity() &&
"only meaningful on the bundle");
3285 for (
const ScheduleData *BundleMember =
this; BundleMember;
3286 BundleMember = BundleMember->NextInBundle) {
3287 if (BundleMember->UnscheduledDeps == InvalidDeps)
3289 Sum += BundleMember->UnscheduledDeps;
3295 if (!isSchedulingEntity()) {
3296 os <<
"/ " << *Inst;
3297 }
else if (NextInBundle) {
3299 ScheduleData *SD = NextInBundle;
3301 os <<
';' << *SD->Inst;
3302 SD = SD->NextInBundle;
3313 Value *OpValue =
nullptr;
3316 TreeEntry *
TE =
nullptr;
3320 ScheduleData *FirstInBundle =
nullptr;
3324 ScheduleData *NextInBundle =
nullptr;
3328 ScheduleData *NextLoadStore =
nullptr;
3342 int SchedulingRegionID = 0;
3345 int SchedulingPriority = 0;
3351 int Dependencies = InvalidDeps;
3357 int UnscheduledDeps = InvalidDeps;
3361 bool IsScheduled =
false;
3366 const BoUpSLP::ScheduleData &SD) {
3391 struct BlockScheduling {
3393 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
3397 ScheduleStart =
nullptr;
3398 ScheduleEnd =
nullptr;
3399 FirstLoadStoreInRegion =
nullptr;
3400 LastLoadStoreInRegion =
nullptr;
3401 RegionHasStackSave =
false;
3405 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3408 ScheduleRegionSize = 0;
3412 ++SchedulingRegionID;
3416 if (BB !=
I->getParent())
3419 ScheduleData *SD = ScheduleDataMap.lookup(
I);
3420 if (SD && isInSchedulingRegion(SD))
3425 ScheduleData *getScheduleData(
Value *V) {
3426 if (
auto *
I = dyn_cast<Instruction>(V))
3427 return getScheduleData(
I);
3431 ScheduleData *getScheduleData(
Value *V,
Value *Key) {
3433 return getScheduleData(V);
3434 auto I = ExtraScheduleDataMap.find(V);
3435 if (
I != ExtraScheduleDataMap.end()) {
3436 ScheduleData *SD =
I->second.lookup(Key);
3437 if (SD && isInSchedulingRegion(SD))
3443 bool isInSchedulingRegion(ScheduleData *SD)
const {
3444 return SD->SchedulingRegionID == SchedulingRegionID;
3449 template <
typename ReadyListType>
3450 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3451 SD->IsScheduled =
true;
3454 for (ScheduleData *BundleMember = SD; BundleMember;
3455 BundleMember = BundleMember->NextInBundle) {
3456 if (BundleMember->Inst != BundleMember->OpValue)
3462 auto &&DecrUnsched = [
this, &ReadyList](
Instruction *
I) {
3463 doForAllOpcodes(
I, [&ReadyList](ScheduleData *OpDef) {
3464 if (OpDef && OpDef->hasValidDependencies() &&
3465 OpDef->incrementUnscheduledDeps(-1) == 0) {
3469 ScheduleData *DepBundle = OpDef->FirstInBundle;
3470 assert(!DepBundle->IsScheduled &&
3471 "already scheduled bundle gets ready");
3472 ReadyList.insert(DepBundle);
3474 <<
"SLP: gets ready (def): " << *DepBundle <<
"\n");
3482 if (TreeEntry *TE = BundleMember->TE) {
3484 int Lane = std::distance(
TE->Scalars.begin(),
3485 find(
TE->Scalars, BundleMember->Inst));
3486 assert(Lane >= 0 &&
"Lane not set");
3494 auto *
In = BundleMember->Inst;
3497 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3498 In->getNumOperands() ==
TE->getNumOperands()) &&
3499 "Missed TreeEntry operands?");
3502 for (
unsigned OpIdx = 0, NumOperands =
TE->getNumOperands();
3503 OpIdx != NumOperands; ++OpIdx)
3504 if (
auto *
I = dyn_cast<Instruction>(
TE->getOperand(OpIdx)[Lane]))
3509 for (
Use &U : BundleMember->Inst->operands())
3510 if (
auto *
I = dyn_cast<Instruction>(
U.get()))
3514 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3515 if (MemoryDepSD->hasValidDependencies() &&
3516 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3519 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3520 assert(!DepBundle->IsScheduled &&
3521 "already scheduled bundle gets ready");
3522 ReadyList.insert(DepBundle);
3524 <<
"SLP: gets ready (mem): " << *DepBundle <<
"\n");
3528 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3529 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3532 ScheduleData *DepBundle = DepSD->FirstInBundle;
3533 assert(!DepBundle->IsScheduled &&
3534 "already scheduled bundle gets ready");
3535 ReadyList.insert(DepBundle);
3537 <<
"SLP: gets ready (ctl): " << *DepBundle <<
"\n");
3548 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3549 ScheduleStart->comesBefore(ScheduleEnd) &&
3550 "Not a valid scheduling region?");
3552 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
3553 auto *SD = getScheduleData(
I);
3556 assert(isInSchedulingRegion(SD) &&
3557 "primary schedule data not in window?");
3558 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3559 "entire bundle in window!");
3561 doForAllOpcodes(
I, [](ScheduleData *SD) { SD->verify(); });
3564 for (
auto *SD : ReadyInsts) {
3565 assert(SD->isSchedulingEntity() && SD->isReady() &&
3566 "item in ready list not ready?");
3571 void doForAllOpcodes(
Value *V,
3573 if (ScheduleData *SD = getScheduleData(V))
3575 auto I = ExtraScheduleDataMap.find(V);
3576 if (
I != ExtraScheduleDataMap.end())
3577 for (
auto &
P :
I->second)
3578 if (isInSchedulingRegion(
P.second))
3583 template <
typename ReadyListType>
3584 void initialFillReadyList(ReadyListType &ReadyList) {
3585 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
3586 doForAllOpcodes(
I, [&](ScheduleData *SD) {
3587 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3589 ReadyList.insert(SD);
3591 <<
"SLP: initially in ready list: " << *SD <<
"\n");
3606 std::optional<ScheduleData *>
3608 const InstructionsState &S);
3614 ScheduleData *allocateScheduleDataChunks();
3618 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
3623 ScheduleData *PrevLoadStore,
3624 ScheduleData *NextLoadStore);
3628 void calculateDependencies(ScheduleData *SD,
bool InsertInReadyList,
3632 void resetSchedule();
3653 ExtraScheduleDataMap;
3666 ScheduleData *FirstLoadStoreInRegion =
nullptr;
3670 ScheduleData *LastLoadStoreInRegion =
nullptr;
3675 bool RegionHasStackSave =
false;
3678 int ScheduleRegionSize = 0;
3687 int SchedulingRegionID = 1;
3695 void scheduleBlock(BlockScheduling *BS);
3702 struct OrdersTypeDenseMapInfo {
3715 static unsigned getHashValue(
const OrdersType &V) {
3736 unsigned MaxVecRegSize;
3737 unsigned MinVecRegSize;
3752 unsigned ReductionBitWidth = 0;
3756 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
3775 struct ChildIteratorType
3777 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3788 return R.VectorizableTree[0].get();
3792 return {
N->UserTreeIndices.begin(),
N->Container};
3796 return {
N->UserTreeIndices.end(),
N->Container};
3801 class nodes_iterator {
3812 bool operator!=(
const nodes_iterator &N2)
const {
return N2.It != It; }
3816 return nodes_iterator(R->VectorizableTree.begin());
3820 return nodes_iterator(R->VectorizableTree.end());
3823 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
3834 OS << Entry->Idx <<
".\n";
3837 for (
auto *V : Entry->Scalars) {
3839 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
3840 return EU.Scalar == V;
3850 if (Entry->State == TreeEntry::NeedToGather)
3852 if (Entry->State == TreeEntry::ScatterVectorize ||
3853 Entry->State == TreeEntry::StridedVectorize)
3854 return "color=blue";
3863 for (
auto *
I : DeletedInstructions) {
3864 for (
Use &U :
I->operands()) {
3865 auto *
Op = dyn_cast<Instruction>(U.get());
3866 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
3870 I->dropAllReferences();
3872 for (
auto *
I : DeletedInstructions) {
3874 "trying to erase instruction with users.");
3875 I->eraseFromParent();
3881#ifdef EXPENSIVE_CHECKS
3892 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
3893 "Expected non-empty mask.");
3896 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
3898 Reuses[Mask[
I]] = Prev[
I];
3906 bool BottomOrder =
false) {
3907 assert(!Mask.empty() &&
"Expected non-empty mask.");
3908 unsigned Sz = Mask.size();
3911 if (Order.
empty()) {
3913 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
3915 PrevOrder.
swap(Order);
3918 for (
unsigned I = 0;
I < Sz; ++
I)
3920 Order[
I] = PrevOrder[Mask[
I]];
3922 return Data.value() == Sz ||
Data.index() ==
Data.value();
3931 if (Order.
empty()) {
3933 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
3943 for (
unsigned I = 0;
I < Sz; ++
I)
3945 Order[MaskOrder[
I]] =
I;
3949std::optional<BoUpSLP::OrdersType>
3951 assert(TE.State == TreeEntry::NeedToGather &&
"Expected gather node only.");
3955 Type *ScalarTy = GatheredScalars.
front()->getType();
3956 int NumScalars = GatheredScalars.
size();
3958 return std::nullopt;
3961 if (NumParts == 0 || NumParts >= NumScalars)
3967 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
3969 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
3972 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
3973 return std::nullopt;
3974 OrdersType CurrentOrder(NumScalars, NumScalars);
3975 if (GatherShuffles.
size() == 1 &&
3977 Entries.front().front()->isSame(TE.Scalars)) {
3980 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
3981 return CurrentOrder;
3985 return all_of(Mask, [&](
int I) {
3992 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
3993 (Entries.size() != 1 ||
3994 Entries.front().front()->ReorderIndices.empty())) ||
3995 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
3996 return std::nullopt;
4001 for (
int I : seq<int>(0, NumParts)) {
4002 if (ShuffledSubMasks.
test(
I))
4004 const int VF = GetVF(
I);
4009 if (
any_of(Slice, [&](
int I) {
return I != NumScalars; })) {
4010 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4011 ShuffledSubMasks.
set(
I);
4015 int FirstMin = INT_MAX;
4016 int SecondVecFound =
false;
4017 for (
int K : seq<int>(0, PartSz)) {
4018 int Idx = Mask[
I * PartSz + K];
4020 Value *V = GatheredScalars[
I * PartSz + K];
4022 SecondVecFound =
true;
4031 SecondVecFound =
true;
4035 FirstMin = (FirstMin / PartSz) * PartSz;
4037 if (SecondVecFound) {
4038 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4039 ShuffledSubMasks.
set(
I);
4042 for (
int K : seq<int>(0, PartSz)) {
4043 int Idx = Mask[
I * PartSz + K];
4047 if (
Idx >= PartSz) {
4048 SecondVecFound =
true;
4051 if (CurrentOrder[
I * PartSz +
Idx] >
4052 static_cast<unsigned>(
I * PartSz + K) &&
4053 CurrentOrder[
I * PartSz +
Idx] !=
4054 static_cast<unsigned>(
I * PartSz +
Idx))
4055 CurrentOrder[
I * PartSz +
Idx] =
I * PartSz + K;
4058 if (SecondVecFound) {
4059 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4060 ShuffledSubMasks.
set(
I);
4065 int PartSz = NumScalars / NumParts;
4066 if (!ExtractShuffles.
empty())
4067 TransformMaskToOrder(
4068 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
4069 if (!ExtractShuffles[
I])
4072 for (
unsigned Idx : seq<unsigned>(0, PartSz)) {
4073 int K =
I * PartSz +
Idx;
4076 if (!TE.ReuseShuffleIndices.empty())
4077 K = TE.ReuseShuffleIndices[K];
4078 if (!TE.ReorderIndices.empty())
4079 K = std::distance(TE.ReorderIndices.begin(),
4080 find(TE.ReorderIndices, K));
4081 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4084 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4086 .getKnownMinValue());
4091 if (GatherShuffles.
size() == 1 && NumParts != 1) {
4092 if (ShuffledSubMasks.
any())
4093 return std::nullopt;
4094 PartSz = NumScalars;
4097 if (!Entries.empty())
4098 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
4099 if (!GatherShuffles[
I])
4101 return std::max(Entries[
I].front()->getVectorFactor(),
4102 Entries[
I].back()->getVectorFactor());
4105 count_if(CurrentOrder, [&](
int Idx) {
return Idx == NumScalars; });
4106 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4107 return std::nullopt;
4108 return std::move(CurrentOrder);
4113 bool CompareOpcodes =
true) {
4116 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4119 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4122 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4126 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4131template <
typename T>
4133 Align CommonAlignment = cast<T>(VL.
front())->getAlign();
4135 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->
getAlign());
4136 return CommonAlignment;
4141 unsigned Sz = Order.
size();
4143 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4154static std::optional<Value *>
4160 const SCEV *PtrSCEVLowest =
nullptr;
4161 const SCEV *PtrSCEVHighest =
nullptr;
4167 return std::nullopt;
4169 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4170 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4174 if (isa<SCEVCouldNotCompute>(Diff))
4175 return std::nullopt;
4177 PtrSCEVLowest = PtrSCEV;
4181 if (isa<SCEVCouldNotCompute>(Diff1))
4182 return std::nullopt;
4184 PtrSCEVHighest = PtrSCEV;
4190 if (isa<SCEVCouldNotCompute>(Dist))
4191 return std::nullopt;
4192 int Size =
DL.getTypeStoreSize(ElemTy);
4193 auto TryGetStride = [&](
const SCEV *Dist,
4194 const SCEV *Multiplier) ->
const SCEV * {
4195 if (
const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4196 if (M->getOperand(0) == Multiplier)
4197 return M->getOperand(1);
4198 if (M->getOperand(1) == Multiplier)
4199 return M->getOperand(0);
4202 if (Multiplier == Dist)
4207 const SCEV *Stride =
nullptr;
4208 if (
Size != 1 || SCEVs.
size() > 2) {
4210 Stride = TryGetStride(Dist, Sz);
4212 return std::nullopt;
4214 if (!Stride || isa<SCEVConstant>(Stride))
4215 return std::nullopt;
4218 using DistOrdPair = std::pair<int64_t, int>;
4220 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
4222 bool IsConsecutive =
true;
4223 for (
const SCEV *PtrSCEV : SCEVs) {
4225 if (PtrSCEV != PtrSCEVLowest) {
4227 const SCEV *Coeff = TryGetStride(Diff, Stride);
4229 return std::nullopt;
4230 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4231 if (!SC || isa<SCEVCouldNotCompute>(SC))
4232 return std::nullopt;
4236 return std::nullopt;
4237 Dist = SC->getAPInt().getZExtValue();
4241 return std::nullopt;
4242 auto Res = Offsets.emplace(Dist, Cnt);
4244 return std::nullopt;
4246 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4249 if (Offsets.size() != SCEVs.
size())
4250 return std::nullopt;
4251 SortedIndices.
clear();
4252 if (!IsConsecutive) {
4256 for (
const std::pair<int64_t, int> &Pair : Offsets) {
4257 SortedIndices[Cnt] = Pair.second;
4278 if (
DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy))
4284 const unsigned Sz = VL.
size();
4286 auto *POIter = PointerOps.
begin();
4287 for (
Value *V : VL) {
4288 auto *L = cast<LoadInst>(V);
4291 *POIter = L->getPointerOperand();
4302 "supported with VectorizeNonPowerOf2");
4306 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4317 if (Order.
empty()) {
4318 Ptr0 = PointerOps.
front();
4319 PtrN = PointerOps.
back();
4321 Ptr0 = PointerOps[Order.
front()];
4322 PtrN = PointerOps[Order.
back()];
4324 std::optional<int> Diff =
4327 if (
static_cast<unsigned>(*Diff) == Sz - 1)
4330 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4342 (
static_cast<unsigned>(std::abs(*Diff)) <=
4345 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4346 *Diff == -(
static_cast<int>(Sz) - 1))) {
4347 int Stride = *Diff /
static_cast<int>(Sz - 1);
4348 if (*Diff == Stride *
static_cast<int>(Sz - 1)) {
4360 else if (
Ptr != Ptr0)
4365 if (((Dist / Stride) * Stride) != Dist ||
4366 !Dists.
insert(Dist).second)
4369 if (Dists.
size() == Sz)
4375 auto CheckForShuffledLoads = [&, &
TTI = *
TTI](
Align CommonAlignment) {
4376 unsigned Sz =
DL->getTypeSizeInBits(ScalarTy);
4378 unsigned MaxVF = std::max<unsigned>(
bit_floor(VL.
size() / 2), MinVF);
4379 MaxVF = std::min(
getMaximumVF(Sz, Instruction::Load), MaxVF);
4380 for (
unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4381 unsigned VectorizedCnt = 0;
4383 for (
unsigned Cnt = 0,
End = VL.
size(); Cnt + VF <=
End;
4384 Cnt += VF, ++VectorizedCnt) {
4402 if (VectorizedCnt == VL.
size() / VF) {
4406 Instruction::Load, VecTy,
4412 auto *LI0 = cast<LoadInst>(VL[
I * VF]);
4416 Instruction::Load, SubVecTy, LI0->getAlign(),
4417 LI0->getPointerAddressSpace(),
CostKind,
4422 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4427 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4432 "Expected only consecutive, strided or masked gather loads.");
4435 for (
int Idx : seq<int>(0, VL.
size()))
4439 ShuffleMask,
CostKind,
I * VF, SubVecTy);
4444 if (MaskedGatherCost > VecLdCost)
4454 bool ProfitableGatherPointers =
4457 return L->isLoopInvariant(V);
4459 if (ProfitableGatherPointers ||
all_of(PointerOps, [IsSorted](
Value *
P) {
4460 auto *
GEP = dyn_cast<GetElementPtrInst>(
P);
4462 (
GEP &&
GEP->getNumOperands() == 2 &&
4463 isa<Constant, Instruction>(
GEP->getOperand(1)));
4465 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4470 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4489 "Expected list of pointer operands.");
4494 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4499 std::optional<int> Diff =
4505 Base.second.emplace_back(
Ptr, *Diff, Cnt++);
4511 if (Bases.
size() > VL.
size() / 2 - 1)
4515 Bases[
Ptr].emplace_back(
Ptr, 0, Cnt++);
4521 bool AnyConsecutive =
false;
4522 for (
auto &
Base : Bases) {
4523 auto &Vec =
Base.second;
4524 if (Vec.size() > 1) {
4526 const std::tuple<Value *, int, unsigned> &
Y) {
4527 return std::get<1>(
X) < std::get<1>(
Y);
4529 int InitialOffset = std::get<1>(Vec[0]);
4531 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
4537 SortedIndices.
clear();
4538 if (!AnyConsecutive)
4541 for (
auto &
Base : Bases) {
4542 for (
auto &
T :
Base.second)
4547 "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();
  Ptrs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    return std::move(Order);
  return std::nullopt;
  if (VU->getType() != V->getType())
  if (!VU->hasOneUse() && !V->hasOneUse())
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
      cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
  bool IsReusedIdx = false;
    if (IE2 == VU && !IE1)
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      IsReusedIdx |= ReusedIdx.test(Idx1);
      ReusedIdx.set(Idx1);
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
      IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    if (IE2 && IE2 != VU) {
      IsReusedIdx |= ReusedIdx.test(Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
      IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
  } while (!IsReusedIdx && (IE1 || IE2));
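// NOTE (illustrative, not from the original source): the loop above walks
// two insertelement chains toward their shared base, marking each insert
// index in a bitset; hitting an already-set index (IsReusedIdx) proves the
// two values cannot belong to a single buildvector sequence.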
std::optional<BoUpSLP::OrdersType>
  if (TE.isNonPowOf2Vec())
    return std::nullopt;
  if (!TE.ReuseShuffleIndices.empty()) {
      return std::nullopt;
    unsigned Sz = TE.Scalars.size();
    if (TE.State == TreeEntry::NeedToGather) {
      if (std::optional<OrdersType> CurrentOrder =
        ::addMask(Mask, TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
            Res[Idx + K * Sz] = I + K * Sz;
        return std::move(Res);
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
            TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
      return std::nullopt;
    if (TE.ReorderIndices.empty())
      std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
    ::addMask(ReorderMask, TE.ReuseShuffleIndices);
    unsigned VF = ReorderMask.size();
    unsigned NumParts = VF / Sz;
    for (unsigned I = 0; I < VF; I += Sz) {
      unsigned UndefCnt = 0;
          Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
        return std::nullopt;
      for (unsigned K = 0; K < NumParts; ++K)
        ResOrder[Val + Sz * K] = I + K;
    return std::move(ResOrder);
  unsigned VF = TE.getVectorFactor();
      TE.ReuseShuffleIndices.end());
  if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
        std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
        return Idx && *Idx < Sz;
    if (TE.ReorderIndices.empty())
      std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
    for (unsigned I = 0; I < VF; ++I) {
      int &Idx = ReusedMask[I];
      Value *V = TE.Scalars[ReorderMask[Idx]];
      Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      std::iota(SubMask.begin(), SubMask.end(), 0);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
    if (TE.State == TreeEntry::NeedToGather &&
        [](const auto &Data) { return Data.index() == Data.value(); }))
      return std::nullopt;
    return std::move(ResOrder);
  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      any_of(TE.UserTreeIndices,
        return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
      (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
    return std::nullopt;
  if ((TE.State == TreeEntry::Vectorize ||
       TE.State == TreeEntry::StridedVectorize) &&
      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
    return TE.ReorderIndices;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
        if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
      if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
        if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
          if (EE1->getOperand(0) != EE2->getOperand(0))
    auto IsIdentityOrder = [](const OrdersType &Order) {
      for (unsigned Idx : seq<unsigned>(0, Order.size()))
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;
    std::iota(Phis.begin(), Phis.end(), 0);
    for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
    for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
      ResOrder[Id] = PhiToId[Phis[Id]];
    if (IsIdentityOrder(ResOrder))
      return std::nullopt;
    return std::move(ResOrder);
  if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
    if ((TE.getOpcode() == Instruction::ExtractElement ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    int Sz = TE.Scalars.size();
        count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
      if (It == TE.Scalars.begin())
      if (It != TE.Scalars.end()) {
        unsigned Idx = std::distance(TE.Scalars.begin(), It);
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          return std::move(Order);
    return std::nullopt;
  if (TE.Scalars.size() >= 4)
    return CurrentOrder;
  return std::nullopt;
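// NOTE (illustrative, not from the original source): this routine returns
// std::nullopt whenever the entry is effectively in identity order, so
// callers treat "no order" and "identity order" the same; e.g. ReorderIndices
// {0, 1, 2, 3} yields std::nullopt while {1, 0, 3, 2} is returned as-is.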
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    if (Cluster != FirstCluster)
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  const unsigned Sz = TE.Scalars.size();
  if (TE.State != TreeEntry::NeedToGather ||
  addMask(NewMask, TE.ReuseShuffleIndices);
  TE.ReorderIndices.clear();
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
4913 "Expected same size of orders");
4914 unsigned Sz = Order.
size();
4916 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
4917 if (Order[
Idx] != Sz)
4918 UsedIndices.
set(Order[
Idx]);
4920 if (SecondaryOrder.
empty()) {
4921 for (
unsigned Idx : seq<unsigned>(0, Sz))
4922 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
4925 for (
unsigned Idx : seq<unsigned>(0, Sz))
4926 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
4927 !UsedIndices.
test(SecondaryOrder[
Idx]))
4928 Order[
Idx] = SecondaryOrder[
Idx];
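// NOTE (illustrative, not from the original source): unset slots in Order
// (value Sz) are filled from SecondaryOrder when doing so does not reuse an
// index; e.g. with Sz = 4, Order {1, 4, 4, 3} and SecondaryOrder
// {1, 0, 2, 3} combine to {1, 0, 2, 3}.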
      ExternalUserReorderMap;
          const std::unique_ptr<TreeEntry> &TE) {
            findExternalStoreUsersReorderIndices(TE.get());
        if (!ExternalUserReorderIndices.empty()) {
          VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
                                        std::move(ExternalUserReorderIndices));
        if (TE->isAltShuffle()) {
          unsigned Opcode0 = TE->getOpcode();
          unsigned Opcode1 = TE->getAltOpcode();
          for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
            if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
              OpcodeMask.set(Lane);
          if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
            VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
        if (std::optional<OrdersType> CurrentOrder =
          const TreeEntry *UserTE = TE.get();
            if (UserTE->UserTreeIndices.size() != 1)
                  return EI.UserTE->State == TreeEntry::Vectorize &&
                         EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
            UserTE = UserTE->UserTreeIndices.back().UserTE;
          VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
          if (!(TE->State == TreeEntry::Vectorize ||
                TE->State == TreeEntry::StridedVectorize) ||
              !TE->ReuseShuffleIndices.empty())
            GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
          if (TE->State == TreeEntry::Vectorize &&
              TE->getOpcode() == Instruction::PHI)
            PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
  for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
    for (const TreeEntry *OpTE : OrderedEntries) {
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
        if (OpTE->State == TreeEntry::NeedToGather ||
            !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
        if (OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
        return OpTE->ReorderIndices;
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
      if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        unsigned E = Order.size();
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
      } else {
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
    if (OrdersUses.empty())
      const unsigned Sz = Order.size();
      for (unsigned Idx : seq<unsigned>(0, Sz))
        if (Idx != Order[Idx] && Order[Idx] != Sz)
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           IsIdentityOrder(BestOrder))) {
        BestOrder = Pair.first;
    if (IsIdentityOrder(BestOrder))
    unsigned E = BestOrder.size();
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
                   return EI.UserTE->Scalars.size() == VF ||
                          EI.UserTE->Scalars.size() ==
                 "All users must be of VF size.");
          reorderNodeWithReuses(*TE, Mask);
      if ((TE->State == TreeEntry::Vectorize ||
           TE->State == TreeEntry::StridedVectorize) &&
          !TE->isAltShuffle()) {
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
      if (!TE->ReuseShuffleIndices.empty()) {
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
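  // NOTE (illustrative, not from the original source): reordering here is
  // decided by voting. Every entry of the current VF contributes its
  // preferred order to OrdersUses; identity orders are tallied separately,
  // and the most popular non-identity order (BestOrder) is applied to all
  // entries of that VF only if it beats the identity count.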
bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
  if (UserTE->isNonPowOf2Vec())
  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize);
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      if (any_of(TE->UserTreeIndices,
                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
      Edges.emplace_back(I, TE);
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
    TreeEntry *Gather = nullptr;
            [&Gather, UserTE, I](TreeEntry *TE) {
              assert(TE->State != TreeEntry::Vectorize &&
                     TE->State != TreeEntry::StridedVectorize &&
                     "Only non-vectorized nodes are expected.");
              if (any_of(TE->UserTreeIndices,
                         [UserTE, I](const EdgeInfo &EI) {
                           return EI.UserTE == UserTE && EI.EdgeIdx == I;
                assert(TE->isSame(UserTE->getOperand(I)) &&
                       "Operand entry does not match operands.");
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize)
    if (std::optional<OrdersType> CurrentOrder =
      OrderedEntries.insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
  while (!OrderedEntries.empty()) {
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            (TE->State == TreeEntry::NeedToGather &&
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
            return EI.UserTE == TE->UserTreeIndices.front().UserTE;
          !Visited.insert(TE).second) {
      for (EdgeInfo &EI : TE->UserTreeIndices) {
        TreeEntry *UserTE = EI.UserTE;
        auto It = Users.find(UserTE);
        if (It == Users.end())
          It = Users.insert({UserTE, {}}).first;
        It->second.emplace_back(EI.EdgeIdx, TE);
    for (TreeEntry *TE : Filtered)
      OrderedEntries.remove(TE);
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    for (auto &Data : UsersVec) {
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        const auto Order = [&]() -> const OrdersType {
          if (OpTE->State == TreeEntry::NeedToGather ||
              !OpTE->ReuseShuffleIndices.empty())
          return OpTE->ReorderIndices;
        if (Order.size() == 1)
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
        if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          unsigned E = Order.size();
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto AllowsReordering = [&](const TreeEntry *TE) {
          if (TE->isNonPowOf2Vec())
          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
          if (TE->State == TreeEntry::NeedToGather) {
        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
          TreeEntry *UserTE = EI.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
          if (AllowsReordering(UserTE))
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                          return EI.UserTE == UserTE;
                  })) <= Ops.size() / 2)
            ++Res.first->second;
      if (OrdersUses.empty()) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        const unsigned Sz = Order.size();
        for (unsigned Idx : seq<unsigned>(0, Sz))
          if (Idx != Order[Idx] && Order[Idx] != Sz)
      unsigned IdentityCnt = 0;
      unsigned VF = Data.second.front().second->getVectorFactor();
      for (auto &Pair : OrdersUses) {
        if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
          IdentityCnt += Pair.second;
      unsigned Cnt = IdentityCnt;
      for (auto &Pair : OrdersUses) {
        if (Cnt < Pair.second) {
          BestOrder = Pair.first;
      if (IsIdentityOrder(BestOrder)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
      unsigned E = BestOrder.size();
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        OrderedEntries.remove(TE);
        if (!VisitedOps.insert(TE).second)
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::StridedVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      for (TreeEntry *Gather : GatherOps) {
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
        OrderedEntries.remove(Gather);
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Data.first->getMainOp()) ||
          Data.first->isAltShuffle())
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle() ||
          Data.first->State == TreeEntry::StridedVectorize) {
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !Data.first->isAltShuffle()) {
          OrderedEntries.insert(Data.first);
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
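// NOTE (illustrative, not from the original source): this pass works
// bottom-up, repeatedly pulling entries with a known order, letting their
// users vote on a shared order, and re-queueing users whose own
// ReorderIndices became non-trivial, until no ordered entries remain.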
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    if (Entry->State == TreeEntry::NeedToGather)
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
      const auto *ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
      for (User *U : Scalar->users()) {
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          if (UseEntry->State == TreeEntry::ScatterVectorize ||
                  Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
            assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
        if (It != ScalarToExtUses.end()) {
          ExternalUses[It->second].User = nullptr;
        int FoundLane = Entry->findLaneForValue(Scalar);
                   << " from lane " << FoundLane << " from " << *Scalar
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, FoundLane);
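// NOTE (illustrative, not from the original source): an ExternalUses record
// of (Scalar, nullptr, Lane) marks a value that must stay extractable for an
// unspecified external consumer, while (Scalar, U, Lane) pins the extract to
// the concrete user U.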
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      if (SI == nullptr || !SI->isSimple() ||
      if (getTreeEntry(U))
      auto &StoresVec = PtrToStoresMap[Ptr];
      if (StoresVec.size() > Lane)
      if (!StoresVec.empty() &&
          SI->getParent() != StoresVec.back()->getParent())
      if (!StoresVec.empty() &&
          SI->getValueOperand()->getType() !=
              StoresVec.back()->getValueOperand()->getType())
      StoresVec.push_back(SI);
  return PtrToStoresMap;
                            OrdersType &ReorderIndices) const {
  StoreOffsetVec[0] = {S0, 0};
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    std::optional<int> Diff =
                        SI->getPointerOperand(), *DL, *SE,
    StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
  stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
                                 const std::pair<StoreInst *, int> &Pair2) {
    int Offset1 = Pair1.second;
    int Offset2 = Pair2.second;
    return Offset1 < Offset2;
  for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
    if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
  ReorderIndices.reserve(StoresVec.size());
        [SI](const std::pair<StoreInst *, int> &Pair) {
          return Pair.first == SI;
        StoreOffsetVec.begin();
    ReorderIndices.push_back(Idx);
  auto IsIdentityOrder = [](const OrdersType &Order) {
    for (unsigned Idx : seq<unsigned>(0, Order.size()))
  if (IsIdentityOrder(ReorderIndices))
    ReorderIndices.clear();
  for (unsigned Idx : Order)
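// NOTE (illustrative, not from the original source): stores form a
// vectorizable group only if their sorted offsets are consecutive; offsets
// {2, 0, 1} pass the +1 adjacency check and yield ReorderIndices {2, 0, 1},
// whereas offsets {0, 2, 3} fail it. An identity order is cleared to empty.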
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();
      collectUserStores(TE);
  for (const auto &Pair : PtrToStoresMap) {
    auto &StoresVec = Pair.second;
    if (StoresVec.size() != NumLanes)
    if (!canFormVector(StoresVec, ReorderIndices))
    ExternalReorderIndices.push_back(ReorderIndices);
  return ExternalReorderIndices;
  UserIgnoreList = &UserIgnoreLst;
  buildTree_rec(Roots, 0, EdgeInfo());
  buildTree_rec(Roots, 0, EdgeInfo());
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {
    if (!NeedsScheduling) {
      NeedsScheduling = V;
  return NeedsScheduling;
                                          bool AllowAlternate) {
  if (auto *LI = dyn_cast<LoadInst>(V)) {
    SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
  if (isa<ExtractElementInst, UndefValue>(V))
  if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
        !isa<UndefValue>(EI->getIndexOperand()))
  } else if (auto *I = dyn_cast<Instruction>(V)) {
    if ((isa<BinaryOperator, CastInst>(I)) &&
                        : cast<CastInst>(I)->getOperand(0)->getType()));
      if (isa<CastInst>(I)) {
        std::pair<size_t, size_t> OpVals =
    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
      if (CI->isCommutative())
    } else if (auto *Call = dyn_cast<CallInst>(I)) {
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
        SubKey = hash_value(Gep->getPointerOperand());
               !isa<ConstantInt>(I->getOperand(1))) {
  return std::make_pair(Key, SubKey);
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  for (unsigned Lane : seq<unsigned>(0, VL.size()))
    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
      OpcodeMask.set(Lane);
          Opcode0, Opcode1, OpcodeMask))
  for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
      Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
  for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
    switch (Res.value_or(0)) {
  constexpr unsigned NumAltInsts = 3;
  unsigned NonInstCnt = 0;
  unsigned UndefCnt = 0;
  unsigned ExtraShuffleInsts = 0;
        return is_contained(Operands.back(), V);
      ++ExtraShuffleInsts;
    if (isa<Constant, ExtractElementInst>(V) ||
        getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
      if (isa<UndefValue>(V))
    if (!Res.second && Res.first->second == 1)
      ++ExtraShuffleInsts;
    ++Res.first->getSecond();
    if (auto *I = dyn_cast<Instruction>(V))
      UniqueOpcodes.insert(I->getOpcode());
    else if (Res.second)
  return none_of(Uniques, [&](const auto &P) {
    return P.first->hasNUsesOrMore(P.second + 1) &&
      return getTreeEntry(U) || Uniques.contains(U);
         (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
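// NOTE (illustrative, not from the original source): the final inequality is
// a rough instruction budget. NumAltInsts = 3 models the two alternate
// vector ops plus the blending shuffle; alternate vectorization is deemed
// profitable only while unique operand opcodes, non-instruction operands,
// and the extra shuffles stay below the scalar operand count
// (operands-per-instruction times the number of lanes).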
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
  assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto *VL0 = cast<Instruction>(S.OpValue);
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
      for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
        if (Term && Term->isTerminator()) {
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
  case Instruction::InsertElement: {
    for (Value *V : VL) {
      SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
             "Non-constant or undef index?");
          return !SourceVectors.contains(V);
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::Load: {
      return TreeEntry::Vectorize;
      return TreeEntry::ScatterVectorize;
      return TreeEntry::StridedVectorize;
    Type *ScalarTy = VL0->getType();
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy))
      LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
             [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
    return TreeEntry::NeedToGather;
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ComparedTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
    Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(V);
      Type *CurTy = GEP->getSourceElementType();
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
    Type *Ty1 = VL0->getOperand(1)->getType();
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      auto *Op = I->getOperand(1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::Store: {
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        return TreeEntry::NeedToGather;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int> Dist =
      if (static_cast<unsigned>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
      return TreeEntry::NeedToGather;
    for (unsigned J = 0; J != NumArgs; ++J)
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(V);
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
        return TreeEntry::NeedToGather;
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (ScalarArgs[J] != A1J) {
                     << "SLP: mismatched arguments in call:" << *CI
                     << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
          return TreeEntry::NeedToGather;
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
    return TreeEntry::NeedToGather;
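// NOTE (illustrative, not from the original source): this classifier maps
// each candidate bundle to an entry state; for loads, for example,
// consecutive pointers give Vectorize, a run-time stride gives
// StridedVectorize, arbitrary pointers give ScatterVectorize, and anything
// else (volatile access, non-packed type) falls back to NeedToGather.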
                            const EdgeInfo &UserTreeIdx) {
  auto TryToFindDuplicates = [&](const InstructionsState &S,
                                 bool DoNotFail = false) {
    for (Value *V : VL) {
      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
    size_t NumUniqueScalarValues = UniqueValues.size();
    if (NumUniqueScalarValues == VL.size()) {
      ReuseShuffleIndicies.clear();
      if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
        LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                             "for nodes with padding.\n");
        newTreeEntry(VL, std::nullopt, S, UserTreeIdx);
      if (NumUniqueScalarValues <= 1 ||
          (UniquePositions.size() == 1 &&
           all_of(UniqueValues,
                    return isa<UndefValue>(V) ||
          !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
        if (DoNotFail && UniquePositions.size() > 1 &&
            NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
                     return isa<ExtractElementInst>(V) ||
                            areAllUsersVectorized(cast<Instruction>(V),
          if (PWSz == VL.size()) {
            ReuseShuffleIndicies.clear();
            NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
            NonUniqueValueVL.append(PWSz - UniqueValues.size(),
                                    UniqueValues.back());
            VL = NonUniqueValueVL;
        newTreeEntry(VL, std::nullopt, S, UserTreeIdx);
  if (!EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
                   << ") is ephemeral.\n");
        newTreeEntry(VL, std::nullopt, S, UserTreeIdx);
      !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
               cast<Instruction>(I)->getOpcode() ==
                   cast<Instruction>(S.MainOp)->getOpcode();
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt, S, UserTreeIdx, ReuseShuffleIndicies);
  if (S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt, S, UserTreeIdx, ReuseShuffleIndicies);
  if (S.OpValue->getType()->isVectorTy() &&
      !isa<InsertElementInst>(S.OpValue)) {
    newTreeEntry(VL, std::nullopt, S, UserTreeIdx);
  if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
    if (SI->getValueOperand()->getType()->isVectorTy()) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
      newTreeEntry(VL, std::nullopt, S, UserTreeIdx);
  auto &&NotProfitableForVectorization = [&S, this,
    if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
    for (Value *V : VL) {
      auto *I = cast<Instruction>(V);
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    auto *I1 = cast<Instruction>(VL.front());
    auto *I2 = cast<Instruction>(VL.back());
    for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
                              I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
            })) >= S.MainOp->getNumOperands() / 2)
    if (S.MainOp->getNumOperands() > 2)
    if (IsCommutative) {
      for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
                                I2->getOperand((Op + 1) % E));
          Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameInsts =
      (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
         auto *I = dyn_cast<GetElementPtrInst>(V);
           BB = I->getParent();
         return BB == I->getParent() && I->getNumOperands() == 2;
      (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
       NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt, S, UserTreeIdx, ReuseShuffleIndicies);
  if (TreeEntry *E = getTreeEntry(S.OpValue)) {
    LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
    if (!E->isSame(VL)) {
      auto It = MultiNodeScalars.find(S.OpValue);
      if (It != MultiNodeScalars.end()) {
        auto *TEIt = find_if(It->getSecond(),
                             [&](TreeEntry *ME) { return ME->isSame(VL); });
        if (TEIt != It->getSecond().end())
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
        if (TryToFindDuplicates(S))
          newTreeEntry(VL, std::nullopt, S, UserTreeIdx, ReuseShuffleIndicies);
    E->UserTreeIndices.push_back(UserTreeIdx);
    LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
  for (Value *V : VL) {
    if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
    if (getTreeEntry(V)) {
                 << ") is already in tree.\n");
      if (TryToFindDuplicates(S))
        newTreeEntry(VL, std::nullopt, S, UserTreeIdx, ReuseShuffleIndicies);
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList && UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        if (TryToFindDuplicates(S))
          newTreeEntry(VL, std::nullopt, S, UserTreeIdx, ReuseShuffleIndicies);
  if (AreAllSameInsts && UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
    assert(S.OpValue->getType()->isPointerTy() &&
           count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
           "Expected pointers only.");
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");
  auto *VL0 = cast<Instruction>(S.OpValue);
    newTreeEntry(VL, std::nullopt, S, UserTreeIdx);
    newTreeEntry(VL, std::nullopt, S, UserTreeIdx);
  if (!TryToFindDuplicates(S, true))
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
  if (State == TreeEntry::NeedToGather) {
    newTreeEntry(VL, std::nullopt, S, UserTreeIdx, ReuseShuffleIndicies);
  auto &BSRef = BlocksSchedules[BB];
    BSRef = std::make_unique<BlockScheduling>(BB);
  BlockScheduling &BS = *BSRef;
  std::optional<ScheduleData *> Bundle =
      BS.tryScheduleBundle(UniqueValues, this, S);
#ifdef EXPENSIVE_CHECKS
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    assert((!BS.getScheduleData(VL0) ||
            !BS.getScheduleData(VL0)->isPartOfBundle()) &&
           "tryScheduleBundle should cancelScheduling on failure");
    newTreeEntry(VL, std::nullopt, S, UserTreeIdx, ReuseShuffleIndicies);
    NonScheduledFirst.insert(VL.front());
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
  unsigned ShuffleOrOp = S.isAltShuffle() ?
                 (unsigned) Instruction::ShuffleVector : S.getOpcode();
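  // NOTE (illustrative, not from the original source): bundles that reach
  // this point survived all the gather checks and were scheduled as a unit;
  // the switch below creates the tree entry for the bundle and recurses into
  // each operand bundle.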
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    auto *PH = cast<PHINode>(VL0);
    newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
    for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
        Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
            PH->getIncomingBlock(I)));
    for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
      buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
      newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
      Op0.assign(VL.size(), VL0->getOperand(0));
      VectorizableTree.back()->setOperand(0, Op0);
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
      for (unsigned Idx : CurrentOrder)
      newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies,
                   CurrentOrder);
      Op0.assign(VL.size(), VL0->getOperand(0));
      VectorizableTree.back()->setOperand(0, Op0);
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
                        decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      Indices.emplace(Idx, I);
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, std::nullopt,
                                 CurrentOrder);
    constexpr int NumOps = 2;
    for (int I = 0; I < NumOps; ++I) {
        VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));
      TE->setOperand(I, VectorOperands[I]);
    buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1});
  case Instruction::Load: {
    TreeEntry *TE = nullptr;
    case TreeEntry::Vectorize:
      if (CurrentOrder.empty()) {
        TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
      } else {
        TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies,
                          CurrentOrder);
      TE->setOperandsInOrder();
    case TreeEntry::StridedVectorize:
      if (CurrentOrder.empty()) {
        TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndicies);
      } else {
        TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
      TE->setOperandsInOrder();
    case TreeEntry::ScatterVectorize:
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndicies);
      TE->setOperandsInOrder();
      buildTree_rec(PointerOps, Depth + 1, {TE, 0});
      LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
    case TreeEntry::NeedToGather:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
      ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      if (NumSignBits * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndicies);
    TE->setOperandsInOrder();
    for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
        Operands.push_back(cast<Instruction>(V)->getOperand(I));
  case Instruction::ICmp:
  case Instruction::FCmp: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndicies);
             "Commutative Predicate mismatch");
      reorderInputsAccordingToOpcode(VL, Left, Right, *this);
      for (Value *V : VL) {
        auto *Cmp = cast<CmpInst>(V);
        if (Cmp->getPredicate() != P0)
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
      if (NumSignBits0 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
      if (NumSignBits1 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndicies);
      reorderInputsAccordingToOpcode(VL, Left, Right, *this);
    TE->setOperandsInOrder();
    for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
        Operands.push_back(cast<Instruction>(V)->getOperand(I));
  case Instruction::GetElementPtr: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndicies);
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GetElementPtrInst>(V);
      Operands.front().push_back(GEP->getPointerOperand());
    Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
                        [VL0Ty, IndexIdx](Value *V) {
                          auto *GEP = dyn_cast<GetElementPtrInst>(V);
                          return VL0Ty == GEP->getOperand(IndexIdx)->getType();
                        : DL->getIndexType(cast<GetElementPtrInst>(VL0)
                                               ->getPointerOperandType()
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
            ConstantInt::get(Ty, 0, false));
      auto *Op = I->getOperand(IndexIdx);
      auto *CI = dyn_cast<ConstantInt>(Op);
            CI, Ty, CI->getValue().isSignBitSet(), *DL));
    for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
  case Instruction::Store: {
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      *OIter = SI->getValueOperand();
    if (CurrentOrder.empty()) {
      TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                   ReuseShuffleIndicies);
      TE->setOperandsInOrder();
    } else {
      TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                   ReuseShuffleIndicies, CurrentOrder);
      TE->setOperandsInOrder();
      LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndicies);
      reorderInputsAccordingToOpcode(VL, Left, Right, *this);
      for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
        for (Value *V : VL) {
          auto *CI2 = cast<CallInst>(V);
      for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
    TE->setOperandsInOrder();
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      for (Value *V : VL) {
        auto *CI2 = cast<CallInst>(V);
  case Instruction::ShuffleVector: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndicies);
    auto *CI = dyn_cast<CmpInst>(VL0);
    if (isa<BinaryOperator>(VL0) || CI) {
            return cast<CmpInst>(V)->isCommutative();
        reorderInputsAccordingToOpcode(VL, Left, Right, *this);
        auto *MainCI = cast<CmpInst>(S.MainOp);
        auto *AltCI = cast<CmpInst>(S.AltOp);
               "Expected different main/alternate predicates.");
        for (Value *V : VL) {
          auto *Cmp = cast<CmpInst>(V);
    TE->setOperandsInOrder();
    for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
        Operands.push_back(cast<Instruction>(V)->getOperand(I));
  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
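// NOTE (illustrative, not from the original source): this helper flattens an
// aggregate into N homogeneous elements; e.g. [4 x {float, float}] maps to
// N = 8 floats, but only if the resulting vector bit width matches the
// aggregate's store size and fits the register-size bounds.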
                                 bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
      all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
  Value *Vec = E0->getOperand(0);
  CurrentOrder.clear();
  if (E0->getOpcode() == Instruction::ExtractValue) {
    LoadInst *LI = dyn_cast<LoadInst>(Vec);
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }
  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
  unsigned MinIdx = NElts, MaxIdx = 0;
    auto *Inst = dyn_cast<Instruction>(V);
    if (Inst->getOperand(0) != Vec)
    if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
      if (isa<UndefValue>(EE->getIndexOperand()))
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
    if (MaxIdx < ExtIdx)
  if (MaxIdx - MinIdx + 1 > E)
  if (MaxIdx + 1 <= E)
  bool ShouldKeepOrder = true;
  for (unsigned I = 0; I < E; ++I) {
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  if (ShouldKeepOrder)
    CurrentOrder.clear();
  return ShouldKeepOrder;
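// NOTE (illustrative, not from the original source): extracts {v[1], v[0],
// v[3], v[2]} from one source give Indices {1, 0, 3, 2} and the permutation
// is recorded in CurrentOrder, whereas {v[0], v[1], v[2], v[3]} is the
// identity, so CurrentOrder is cleared and ShouldKeepOrder is true.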
bool BoUpSLP::areAllUsersVectorized(
  return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
           return ScalarToTreeEntry.contains(U) ||
                  isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
static std::pair<InstructionCost, InstructionCost>
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
                                    dyn_cast<IntrinsicInst>(CI));
  auto IntrinsicCost =
  auto LibCost = IntrinsicCost;
  return {IntrinsicCost, LibCost};
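// NOTE (illustrative, not from the original source): the pair returned above
// lets the caller pick the cheaper lowering for a vectorized call, comparing
// the target's intrinsic cost against a vector math-library call when one is
// available.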
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
  unsigned Sz = Scalars.size();
  if (!ReorderIndices.empty())
  for (unsigned I = 0; I < Sz; ++I) {
    if (!ReorderIndices.empty())
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
  if (!ReuseShuffleIndices.empty()) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
    return MainP != P && MainP != SwappedP;
  const auto *Op0 = Ops.front();
  const bool IsUniform = all_of(Ops, [=](Value *V) {
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
  if (IsConstant && IsUniform)
  else if (IsConstant)
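// NOTE (illustrative, not from the original source): these predicates feed
// the target's operand-value info, since many targets cost shifts and
// multiplies by a uniform or power-of-two constant more cheaply than by a
// variable amount; e.g. Ops = {8, 8, 8, 8} is both uniform and a power of
// two.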
class BaseShuffleAnalysis {
    int Limit = Mask.size();
    if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
    unsigned VF = Mask.size();
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      int MaskedIdx = Mask[ExtMask[I] % VF];
                                    bool SinglePermute) {
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
      if (isIdentityMask(Mask, SVTy, false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, true) &&
                 IdentityMask.size()))) {
          IdentityMask.assign(Mask);
      if (SV->isZeroEltSplat()) {
        IdentityMask.assign(Mask);
      int LocalVF = Mask.size();
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
        ExtMask[Idx] = SV->getMaskValue(I);
      if (!IsOp1Undef && !IsOp2Undef) {
        for (int &I : Mask) {
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
                                         SV->getShuffleMask().end());
        combineMasks(LocalVF, ShuffleMask, Mask);
        Mask.swap(ShuffleMask);
        Op = SV->getOperand(0);
        Op = SV->getOperand(1);
      if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
          !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
               "Expected masks of same sizes.");
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
  template <typename T, typename ShuffleBuilderTy>
                         ShuffleBuilderTy &Builder) {
    assert(V1 && "Expected at least one vector value.");
      Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      for (int I = 0, E = Mask.size(); I < E; ++I) {
          CombinedMask1[I] = Mask[I];
          CombinedMask2[I] = Mask[I] - VF;
        (void)peekThroughShuffles(Op1, CombinedMask1, false);
        (void)peekThroughShuffles(Op2, CombinedMask2, false);
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
                ExtMask1[Idx] = SV1->getMaskValue(I);
                cast<FixedVectorType>(SV1->getOperand(1)->getType())
                ExtMask1, UseMask::SecondArg);
                ExtMask2[Idx] = SV2->getMaskValue(I);
                cast<FixedVectorType>(SV2->getOperand(1)->getType())
                ExtMask2, UseMask::SecondArg);
            if (SV1->getOperand(0)->getType() ==
                    SV2->getOperand(0)->getType() &&
                SV1->getOperand(0)->getType() != SV1->getType() &&
              Op1 = SV1->getOperand(0);
              Op2 = SV2->getOperand(0);
                                              SV1->getShuffleMask().end());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
              CombinedMask1.swap(ShuffleMask1);
                                              SV2->getShuffleMask().end());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
              CombinedMask2.swap(ShuffleMask2);
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(cast<VectorType>(Op1->getType())
                        .getKnownMinValue(),
                    cast<VectorType>(Op2->getType())
                        .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
               "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
          isa<ShuffleVectorInst>(Op1) &&
          cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    bool IsIdentity = peekThroughShuffles(V1, NewMask, true);
    assert(V1 && "Expected non-null value after looking through shuffles.");
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
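// NOTE (illustrative, not from the original source): the builder above folds
// chains of shufflevectors by composing their masks, so
// shuffle(shuffle(x, m1), m2) collapses to a single shuffle of x with the
// combined mask, and an identity result is returned as the input vector
// itself.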
  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
          Mask, NumSrcElts, NumSubElts, Index)) {
    if (Index + NumSubElts > NumSrcElts &&
        Index + NumSrcElts <= static_cast<int>(Mask.size()))
static std::pair<InstructionCost, InstructionCost>
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
    for (Value *V : Ptrs) {
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      if (!Ptr || !Ptr->hasOneUse())
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
          TTI::PointersChainInfo::getKnownStride(),
            [](const Value *V) {
              auto *Ptr = dyn_cast<GetElementPtrInst>(V);
              return Ptr && !Ptr->hasAllConstantIndices();
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
          BaseGEP->getPointerOperand(), Indices, VecTy,
  return std::make_pair(ScalarCost, VecCost);
7835 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
7836 TreeEntry &E = *TE.get();
7837 switch (E.getOpcode()) {
7838 case Instruction::Load: {
7839 Type *ScalarTy = E.getMainOp()->getType();
7841 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
7848 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
7855 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
7856 false, CommonAlignment,
CostKind, BaseLI);
7857 if (StridedCost < OriginalVecCost)
7860 E.State = TreeEntry::StridedVectorize;
7877 bool IsFinalized =
false;
7880 Type *ScalarTy =
nullptr;
7891 bool SameNodesEstimated =
true;
7900 if (
auto *VTy = dyn_cast<VectorType>(Ty))
7916 const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
7917 unsigned MinVF = R.getMinVF(2 * Sz);
7918 if (VL.
size() > 2 &&
7919 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
7920 (InVectors.
empty() &&
7923 ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
7924 InstructionsState S = getSameOpcode(SubVL, *R.TLI);
7925 return S.getOpcode() == Instruction::Load &&
7928 !
all_of(Gathers, [&](
Value *V) {
return R.getTreeEntry(V); }) &&
7934 unsigned StartIdx = 0;
7935 unsigned VF = VL.
size() / 2;
7936 for (; VF >= MinVF; VF /= 2) {
7937 for (
unsigned Cnt = StartIdx,
End = VL.
size(); Cnt + VF <=
End;
7940 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
7942 if (SliceS.getOpcode() != Instruction::Load ||
7943 SliceS.isAltShuffle())
7951 CurrentOrder, PointerOps);
7961 CurrentOrder.
empty()) ||
7970 if (Cnt == StartIdx)
7979 if (StartIdx >= VL.
size())
7982 if (!VectorizedLoads.
empty())
7985 if (!VectorizedLoads.
empty()) {
7987 bool NeedInsertSubvectorAnalysis =
7988 !NumParts || (VL.
size() / VF) > NumParts;
7994 getBuildVectorCost(VL.
slice(
I, std::min(
End -
I, VF)), Root);
8001 for (
Value *V : VectorizedLoads) {
8002 auto *LI = cast<LoadInst>(V);
8009 for (
const std::pair<unsigned, LoadsState> &
P : VectorizedStarts) {
8010 auto *LI = cast<LoadInst>(VL[
P.first]);
8019 false, Alignment, CostKind, LI);
8023 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
8024 auto [ScalarGEPCost, VectorGEPCost] =
8026 Instruction::Load, CostKind, LI->
getType(), LoadTy);
8027 GatherCost += VectorGEPCost - ScalarGEPCost;
8029 for (
unsigned P : ScatterVectorized) {
8030 auto *LI0 = cast<LoadInst>(VL[
P]);
8032 Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
8034 Instruction::Load, LoadTy, LI0->getPointerOperand(),
8035 false, CommonAlignment, CostKind, LI0);
8039 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
8047 auto [ScalarGEPCost, VectorGEPCost] =
8049 CostKind, ScalarTy, VecTy);
8050 GatherCost += VectorGEPCost - ScalarGEPCost;
8051 if (!Order.
empty()) {
8055 VecTy, Mask, CostKind);
8058 GatherCost += R.getGatherCost(PointerOps,
true,
8059 PointerOps.
front()->getType());
8062 if (NeedInsertSubvectorAnalysis) {
8065 for (
unsigned I = VF, E = VL.
size();
I < E;
I += VF) {
8066 for (
unsigned Idx : seq<unsigned>(0, E))
8069 ShuffleMask, CostKind,
I, LoadTy);
8072 GatherCost -= ScalarsCost;
8074 GatherCost = std::min(BaseCost, GatherCost);
8075 }
else if (!Root &&
isSplat(VL)) {
8078 const auto *It =
find_if_not(VL, IsaPred<UndefValue>);
8079 assert(It != VL.
end() &&
"Expected at least one non-undef value.");
8082 count(VL, *It) > 1 &&
8086 CostKind, std::distance(VL.
begin(), It),
8091 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
8097 VecTy, ShuffleMask, CostKind,
8102 (
all_of(Gathers, IsaPred<UndefValue>)
8104 : R.getGatherCost(Gathers, !Root && VL.
equals(Gathers),
8112 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8113 unsigned NumParts) {
8114 assert(VL.
size() > NumParts &&
"Unexpected scalarized shuffle.");
8116 std::accumulate(VL.
begin(), VL.
end(), 0, [](
unsigned Sz,
Value *V) {
8117 auto *EE = dyn_cast<ExtractElementInst>(V);
8120 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
8123 return std::max(Sz, VecTy->getNumElements());
8125 unsigned NumSrcRegs =
8127 if (NumSrcRegs == 0)
8132 auto CheckPerRegistersShuffle =
8137 int FirstRegId = -1;
8138 for (
int &
I : Mask) {
8141 int RegId = (
I / NumElts) * NumParts + (
I % NumElts) / EltsPerVector;
8144 RegIndices.
insert(RegId);
8145 if (RegIndices.
size() > 2)
8146 return std::nullopt;
8147 if (RegIndices.
size() == 2)
8149 I = (
I % NumElts) % EltsPerVector +
8150 (RegId == FirstRegId ? 0 : EltsPerVector);
8159 for (
unsigned Part = 0; Part < NumParts; ++Part) {
8160 if (!ShuffleKinds[Part])
8163 Mask.slice(Part * EltsPerVector,
8164 (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
8165 ? Mask.size() % EltsPerVector
8169 std::optional<TTI::ShuffleKind> RegShuffleKind =
8170 CheckPerRegistersShuffle(SubMask);
8171 if (!RegShuffleKind) {
8190 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8197 void estimateNodesPermuteCost(
const TreeEntry &E1,
const TreeEntry *E2,
8199 unsigned SliceSize) {
8200 if (SameNodesEstimated) {
8206 if ((InVectors.
size() == 2 &&
8207 InVectors.
front().get<
const TreeEntry *>() == &E1 &&
8208 InVectors.
back().get<
const TreeEntry *>() == E2) ||
8209 (!E2 && InVectors.
front().get<
const TreeEntry *>() == &E1)) {
8212 "Expected all poisoned elements.");
8215 copy(SubMask, std::next(CommonMask.
begin(), SliceSize * Part));
8220 Cost += createShuffle(InVectors.
front(),
8221 InVectors.
size() == 1 ?
nullptr : InVectors.
back(),
8223 transformMaskAfterShuffle(CommonMask, CommonMask);
8225 SameNodesEstimated =
false;
8226 if (!E2 && InVectors.
size() == 1) {
8227 unsigned VF = E1.getVectorFactor();
8230 cast<FixedVectorType>(V1->
getType())->getNumElements());
8232 const auto *E = InVectors.
front().get<
const TreeEntry *>();
8233 VF = std::max(VF, E->getVectorFactor());
8235 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8237 CommonMask[
Idx] = Mask[
Idx] + VF;
8238 Cost += createShuffle(InVectors.
front(), &E1, CommonMask);
8239 transformMaskAfterShuffle(CommonMask, CommonMask);
8241 Cost += createShuffle(&E1, E2, Mask);
8242 transformMaskAfterShuffle(CommonMask, Mask);
8246 class ShuffleCostBuilder {
8249 static bool isEmptyOrIdentity(
ArrayRef<int> Mask,
unsigned VF) {
8251 return Mask.empty() ||
8252 (VF == Mask.size() &&
8260 ~ShuffleCostBuilder() =
default;
8265 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
8266 if (isEmptyOrIdentity(Mask, VF))
8269 cast<VectorType>(V1->
getType()), Mask);
8274 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
8275 if (isEmptyOrIdentity(Mask, VF))
8278 cast<VectorType>(V1->
getType()), Mask);
8284 void resizeToMatch(
Value *&,
Value *&)
const {}
8294 ShuffleCostBuilder Builder(
TTI);
8297 unsigned CommonVF = Mask.size();
8299 auto GetNodeMinBWAffectedCost = [&](
const TreeEntry &E,
8301 if (E.State == TreeEntry::NeedToGather &&
allConstant(E.Scalars))
8303 Type *EScalarTy = E.Scalars.front()->getType();
8304 bool IsSigned =
true;
8305 if (
auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
8307 IsSigned = It->second.second;
8309 if (EScalarTy != ScalarTy) {
8310 unsigned CastOpcode = Instruction::Trunc;
8311 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8312 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8314 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8323 if (isa<Constant>(V))
8325 auto *VecTy = cast<VectorType>(V->getType());
8327 if (EScalarTy != ScalarTy) {
8329 unsigned CastOpcode = Instruction::Trunc;
8330 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8331 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8333 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8340 if (!V1 && !V2 && !P2.
isNull()) {
8342 const TreeEntry *E = P1.
get<
const TreeEntry *>();
8343 unsigned VF = E->getVectorFactor();
8344 const TreeEntry *E2 = P2.
get<
const TreeEntry *>();
8345 CommonVF = std::max(VF, E2->getVectorFactor());
8348 return Idx < 2 * static_cast<int>(CommonVF);
8350 "All elements in mask must be less than 2 * CommonVF.");
8351 if (E->Scalars.size() == E2->Scalars.size()) {
8355 for (
int &
Idx : CommonMask) {
8358 if (
Idx <
static_cast<int>(CommonVF) && !EMask.
empty())
8360 else if (
Idx >=
static_cast<int>(CommonVF))
8361 Idx = (E2Mask.
empty() ?
Idx - CommonVF : E2Mask[
Idx - CommonVF]) +
8365 CommonVF = E->Scalars.size();
8366 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
8367 GetNodeMinBWAffectedCost(*E2, CommonVF);
8369 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
8370 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
8374 }
else if (!V1 && P2.
isNull()) {
8376 const TreeEntry *E = P1.
get<
const TreeEntry *>();
8377 unsigned VF = E->getVectorFactor();
8381 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
8382 "All elements in mask must be less than CommonVF.");
8383 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8385 assert(!EMask.
empty() &&
"Expected non-empty common mask.");
8386 for (
int &
Idx : CommonMask) {
8390 CommonVF = E->Scalars.size();
8392 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
8395 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8396 CommonVF == CommonMask.
size() &&
8398 [](
const auto &&
P) {
8400 static_cast<unsigned>(
P.value()) !=
P.index();
8408 }
else if (V1 && P2.
isNull()) {
8410 ExtraCost += GetValueMinBWAffectedCost(V1);
8411 CommonVF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8414 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
8415 "All elements in mask must be less than CommonVF.");
8416 }
else if (V1 && !V2) {
8418 unsigned VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8419 const TreeEntry *E2 = P2.
get<
const TreeEntry *>();
8420 CommonVF = std::max(VF, E2->getVectorFactor());
8423 return Idx < 2 * static_cast<int>(CommonVF);
8425 "All elements in mask must be less than 2 * CommonVF.");
8426 if (E2->Scalars.size() == VF && VF != CommonVF) {
8428 assert(!E2Mask.
empty() &&
"Expected non-empty common mask.");
8429 for (
int &
Idx : CommonMask) {
8432 if (
Idx >=
static_cast<int>(CommonVF))
8433 Idx = E2Mask[
Idx - CommonVF] + VF;
8437 ExtraCost += GetValueMinBWAffectedCost(V1);
8439 ExtraCost += GetNodeMinBWAffectedCost(
8440 *E2, std::min(CommonVF, E2->getVectorFactor()));
8442 }
else if (!V1 && V2) {
8444 unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
8445 const TreeEntry *E1 = P1.
get<
const TreeEntry *>();
8446 CommonVF = std::max(VF, E1->getVectorFactor());
8449 return Idx < 2 * static_cast<int>(CommonVF);
8451 "All elements in mask must be less than 2 * CommonVF.");
8452 if (E1->Scalars.size() == VF && VF != CommonVF) {
8454 assert(!E1Mask.
empty() &&
"Expected non-empty common mask.");
8455 for (
int &
Idx : CommonMask) {
8458 if (
Idx >=
static_cast<int>(CommonVF))
8459 Idx = E1Mask[
Idx - CommonVF] + VF;
8465 ExtraCost += GetNodeMinBWAffectedCost(
8466 *E1, std::min(CommonVF, E1->getVectorFactor()));
8468 ExtraCost += GetValueMinBWAffectedCost(V2);
8471 assert(V1 && V2 &&
"Expected both vectors.");
8472 unsigned VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8474 std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
8477 return Idx < 2 * static_cast<int>(CommonVF);
8479 "All elements in mask must be less than 2 * CommonVF.");
8481 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
8482 if (V1->
getType() != V2->getType()) {
8486 if (cast<VectorType>(V1->
getType())->getElementType() != ScalarTy)
8488 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
8494 if (InVectors.
size() == 2)
8496 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
8497 V1, V2, CommonMask, Builder);
8504 : ScalarTy(ScalarTy),
TTI(
TTI),
8505 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
8506 CheckedExtracts(CheckedExtracts) {}
8508 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8509 unsigned NumParts,
bool &UseVecBaseAsInput) {
8510 UseVecBaseAsInput =
false;
8513 Value *VecBase =
nullptr;
8516 if (NumParts == VL.
size())
8520 bool PrevNodeFound =
any_of(
8522 [&](
const std::unique_ptr<TreeEntry> &TE) {
8523 return ((!TE->isAltShuffle() &&
8524 TE->getOpcode() == Instruction::ExtractElement) ||
8525 TE->State == TreeEntry::NeedToGather) &&
8526 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
8527 return VL.size() > Data.index() &&
8528 (Mask[Data.index()] == PoisonMaskElem ||
8529 isa<UndefValue>(VL[Data.index()]) ||
8530 Data.value() == VL[Data.index()]);
8534 unsigned SliceSize = VL.
size() / NumParts;
8535 for (
unsigned Part = 0; Part < NumParts; ++Part) {
8536 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
8537 for (
auto [
I, V] :
enumerate(VL.
slice(Part * SliceSize, SliceSize))) {
8539 if (isa<UndefValue>(V) ||
8548 auto *EE = cast<ExtractElementInst>(V);
8549 VecBase = EE->getVectorOperand();
8550 UniqueBases.
insert(VecBase);
8551 const TreeEntry *VE = R.getTreeEntry(V);
8552 if (!CheckedExtracts.
insert(V).second ||
8553 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
8556 return isa<GetElementPtrInst>(U) &&
8557 !R.areAllUsersVectorized(cast<Instruction>(U),
8565 unsigned Idx = *EEIdx;
8567 if (EE->hasOneUse() || !PrevNodeFound) {
8569 if (isa<SExtInst, ZExtInst>(Ext) &&
8570 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8575 EE->getVectorOperandType(),
Idx);
8578 Ext->getOpcode(), Ext->getType(), EE->getType(),
8594 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8597 transformMaskAfterShuffle(CommonMask, CommonMask);
8598 SameNodesEstimated =
false;
8599 if (NumParts != 1 && UniqueBases.
size() != 1) {
8600 UseVecBaseAsInput =
true;
8608 std::optional<InstructionCost>
8612 return std::nullopt;
8618 return Idx < static_cast<int>(E1.getVectorFactor());
8620 "Expected single vector shuffle mask.");
8624 if (InVectors.
empty()) {
8625 CommonMask.
assign(Mask.begin(), Mask.end());
8626 InVectors.
assign({&E1, &E2});
8629 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
8632 if (NumParts == 0 || NumParts >= Mask.size())
8634 unsigned SliceSize = Mask.size() / NumParts;
8637 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8638 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
8641 if (InVectors.
empty()) {
8642 CommonMask.
assign(Mask.begin(), Mask.end());
8643 InVectors.
assign(1, &E1);
8646 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
8649 if (NumParts == 0 || NumParts >= Mask.size())
8651 unsigned SliceSize = Mask.size() / NumParts;
8654 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8655 estimateNodesPermuteCost(E1,
nullptr, Mask, Part, SliceSize);
8656 if (!SameNodesEstimated && InVectors.
size() == 1)
8669 cast<ExtractElementInst>(InVectors.
front()
8670 .get<
const TreeEntry *>()
8671 ->Scalars[
P.index()]);
8672 return EI->getVectorOperand() == V1 ||
8673 EI->getVectorOperand() == V2;
8675 "Expected extractelement vectors.");
8679 if (InVectors.
empty()) {
8681 "Expected empty input mask/vectors.");
8682 CommonMask.
assign(Mask.begin(), Mask.end());
8689 InVectors.
front().is<
const TreeEntry *>() && !CommonMask.
empty() &&
8693 .get<const TreeEntry *>()
8694 ->Scalars[
P.index()];
8696 return P.value() == Mask[
P.index()] ||
8697 isa<UndefValue>(Scalar);
8698 if (isa<Constant>(V1))
8700 auto *EI = cast<ExtractElementInst>(Scalar);
8701 return EI->getVectorOperand() == V1;
8703 "Expected only tree entry for extractelement vectors.");
8707 "Expected only tree entries from extracts/reused buildvectors.");
8708 unsigned VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
8709 if (InVectors.
size() == 2) {
8710 Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
8711 transformMaskAfterShuffle(CommonMask, CommonMask);
8712 VF = std::max<unsigned>(VF, CommonMask.
size());
8713 }
else if (
const auto *InTE =
8714 InVectors.
front().dyn_cast<
const TreeEntry *>()) {
8715 VF = std::max(VF, InTE->getVectorFactor());
8719 ->getNumElements());
8722 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8724 CommonMask[
Idx] = Mask[
Idx] + VF;
8727 Value *Root =
nullptr) {
8728 Cost += getBuildVectorCost(VL, Root);
8732 unsigned VF = VL.
size();
8734 VF = std::min(VF, MaskVF);
8736 if (isa<UndefValue>(V)) {
8746 cast<FixedVectorType>(Root->
getType())->getNumElements()),
8747 getAllOnesValue(*R.DL, ScalarTy));
8757 if (InVectors.
size() == 2)
8758 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
8760 Cost += createShuffle(Vec,
nullptr, CommonMask);
8761 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
8765 "Expected vector length for the final value before action.");
8767 Action(V, CommonMask);
8768 InVectors.
front() = V;
8771 if (CommonMask.
empty()) {
8772 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
8776 createShuffle(InVectors.
front(),
8777 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
8783 "Shuffle construction must be finalized.");
8787const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(
const TreeEntry *E,
8788 unsigned Idx)
const {
8790 if (
const TreeEntry *TE = getTreeEntry(
Op)) {
8791 if (
find_if(TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
8792 return EI.EdgeIdx == Idx && EI.UserTE == E;
8793 }) != TE->UserTreeIndices.end())
8795 auto MIt = MultiNodeScalars.
find(
Op);
8796 if (MIt != MultiNodeScalars.
end()) {
8797 for (
const TreeEntry *TE : MIt->second) {
8798 if (
find_if(TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
8799 return EI.EdgeIdx == Idx && EI.UserTE == E;
8800 }) != TE->UserTreeIndices.end())
8806 find_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
8807 return TE->State == TreeEntry::NeedToGather &&
8808 find_if(
TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
8809 return EI.EdgeIdx == Idx && EI.UserTE == E;
8810 }) !=
TE->UserTreeIndices.end();
8812 assert(It != VectorizableTree.end() &&
"Expected vectorizable entry.");
8817 if (
TE.State == TreeEntry::ScatterVectorize ||
8818 TE.State == TreeEntry::StridedVectorize)
8820 if (
TE.State == TreeEntry::Vectorize &&
TE.getOpcode() == Instruction::Load &&
8821 !
TE.isAltShuffle()) {
8822 if (
TE.ReorderIndices.empty())
8861 Type *ScalarTy = VL[0]->getType();
8862 if (E->State != TreeEntry::NeedToGather) {
8863 if (
auto *SI = dyn_cast<StoreInst>(VL[0]))
8864 ScalarTy =
SI->getValueOperand()->getType();
8865 else if (
auto *CI = dyn_cast<CmpInst>(VL[0]))
8867 else if (
auto *IE = dyn_cast<InsertElementInst>(VL[0]))
8868 ScalarTy =
IE->getOperand(1)->getType();
8877 auto It = MinBWs.
find(E);
8878 Type *OrigScalarTy = ScalarTy;
8879 if (It != MinBWs.
end()) {
8883 unsigned EntryVF = E->getVectorFactor();
8886 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
8887 if (E->State == TreeEntry::NeedToGather) {
8890 if (isa<InsertElementInst>(VL[0]))
8892 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
8893 E, ScalarTy, *
TTI, VectorizedVals, *
this, CheckedExtracts);
8898 if (!E->ReorderIndices.empty() &&
8899 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
8901 if (E->getOpcode() == Instruction::Store) {
8903 NewMask.
resize(E->ReorderIndices.size());
8904 copy(E->ReorderIndices, NewMask.
begin());
8910 if (NeedToShuffleReuses)
8911 ::addMask(Mask, E->ReuseShuffleIndices);
8915 assert((E->State == TreeEntry::Vectorize ||
8916 E->State == TreeEntry::ScatterVectorize ||
8917 E->State == TreeEntry::StridedVectorize) &&
8921 (E->getOpcode() == Instruction::GetElementPtr &&
8922 E->getMainOp()->getType()->isPointerTy())) &&
8925 unsigned ShuffleOrOp =
8926 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
8928 const unsigned Sz = UniqueValues.
size();
8930 for (
unsigned I = 0;
I < Sz; ++
I) {
8931 if (getTreeEntry(UniqueValues[
I]) == E)
8935 auto GetCastContextHint = [&](
Value *
V) {
8936 if (
const TreeEntry *OpTE = getTreeEntry(V))
8937 return getCastContextHint(*OpTE);
8938 InstructionsState SrcState =
getSameOpcode(E->getOperand(0), *TLI);
8939 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
8948 if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
8952 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
8954 for (
unsigned I = 0;
I < Sz; ++
I) {
8955 if (UsedScalars.test(
I))
8957 ScalarCost += ScalarEltCost(
I);
8965 const EdgeInfo &EI = E->UserTreeIndices.front();
8966 if ((EI.UserTE->getOpcode() != Instruction::Select ||
8968 It != MinBWs.
end()) {
8969 auto UserBWIt = MinBWs.
find(EI.UserTE);
8970 Type *UserScalarTy =
8971 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
8972 if (UserBWIt != MinBWs.
end())
8974 UserBWIt->second.first);
8975 if (ScalarTy != UserScalarTy) {
8976 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
8977 unsigned SrcBWSz =
DL->getTypeSizeInBits(UserScalarTy);
8982 VecOpcode = Instruction::Trunc;
8985 It->second.second ? Instruction::SExt : Instruction::ZExt;
8992 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
8993 ScalarCost,
"Calculated costs for Tree"));
8994 return VecCost - ScalarCost;
8999 assert((E->State == TreeEntry::Vectorize ||
9000 E->State == TreeEntry::StridedVectorize) &&
9001 "Entry state expected to be Vectorize or StridedVectorize here.");
9005 *
TTI, Ptrs, BasePtr, E->getOpcode(),
CostKind, OrigScalarTy, VecTy);
9006 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
9007 "Calculated GEPs cost for Tree"));
9009 return VecCost - ScalarCost;
9012 switch (ShuffleOrOp) {
9013 case Instruction::PHI: {
9017 for (
Value *V : UniqueValues) {
9018 auto *
PHI = dyn_cast<PHINode>(V);
9023 for (
unsigned I = 0,
N =
PHI->getNumIncomingValues();
I <
N; ++
I) {
9027 if (
const TreeEntry *OpTE = getTreeEntry(
Operands.front()))
9029 if (!OpTE->ReuseShuffleIndices.empty())
9030 ScalarCost +=
TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
9031 OpTE->Scalars.size());
9034 return CommonCost - ScalarCost;
9036 case Instruction::ExtractValue:
9037 case Instruction::ExtractElement: {
9038 auto GetScalarCost = [&](
unsigned Idx) {
9039 auto *
I = cast<Instruction>(UniqueValues[
Idx]);
9041 if (ShuffleOrOp == Instruction::ExtractElement) {
9042 auto *EE = cast<ExtractElementInst>(
I);
9043 SrcVecTy = EE->getVectorOperandType();
9045 auto *EV = cast<ExtractValueInst>(
I);
9046 Type *AggregateTy = EV->getAggregateOperand()->getType();
9048 if (
auto *ATy = dyn_cast<ArrayType>(AggregateTy))
9049 NumElts = ATy->getNumElements();
9054 if (
I->hasOneUse()) {
9056 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
9057 all_of(
Ext->users(), IsaPred<GetElementPtrInst>)) {
9064 Ext->getOpcode(),
Ext->getType(),
I->getType(),
9072 auto GetVectorCost = [](
InstructionCost CommonCost) {
return CommonCost; };
9073 return GetCostDiff(GetScalarCost, GetVectorCost);
9075 case Instruction::InsertElement: {
9076 assert(E->ReuseShuffleIndices.empty() &&
9077 "Unique insertelements only are expected.");
9078 auto *SrcVecTy = cast<FixedVectorType>(VL0->
getType());
9079 unsigned const NumElts = SrcVecTy->getNumElements();
9080 unsigned const NumScalars = VL.
size();
9086 unsigned OffsetEnd = OffsetBeg;
9087 InsertMask[OffsetBeg] = 0;
9090 if (OffsetBeg >
Idx)
9092 else if (OffsetEnd <
Idx)
9094 InsertMask[
Idx] =
I + 1;
9098 VecScalarsSz =
PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
9099 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9101 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9102 unsigned InsertVecSz = std::min<unsigned>(
9104 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9105 bool IsWholeSubvector =
9106 OffsetBeg ==
Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
9110 if (OffsetBeg + InsertVecSz > VecSz) {
9113 InsertVecSz = VecSz;
9119 if (!E->ReorderIndices.empty()) {
9124 std::iota(
Mask.begin(), std::next(
Mask.begin(), InsertVecSz), 0);
9126 bool IsIdentity =
true;
9128 Mask.swap(PrevMask);
9129 for (
unsigned I = 0;
I < NumScalars; ++
I) {
9131 DemandedElts.
setBit(InsertIdx);
9132 IsIdentity &= InsertIdx - OffsetBeg ==
I;
9133 Mask[InsertIdx - OffsetBeg] =
I;
9135 assert(
Offset < NumElts &&
"Failed to find vector index offset");
9150 auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
9151 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
9159 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
9160 if (!InMask.
all() && NumScalars != NumElts && !IsWholeSubvector) {
9161 if (InsertVecSz != VecSz) {
9173 for (
unsigned I = OffsetEnd + 1 -
Offset;
I < VecSz; ++
I)
9182 case Instruction::ZExt:
9183 case Instruction::SExt:
9184 case Instruction::FPToUI:
9185 case Instruction::FPToSI:
9186 case Instruction::FPExt:
9187 case Instruction::PtrToInt:
9188 case Instruction::IntToPtr:
9189 case Instruction::SIToFP:
9190 case Instruction::UIToFP:
9191 case Instruction::Trunc:
9192 case Instruction::FPTrunc:
9193 case Instruction::BitCast: {
9194 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
9197 unsigned Opcode = ShuffleOrOp;
9198 unsigned VecOpcode = Opcode;
9200 (SrcIt != MinBWs.
end() || It != MinBWs.
end())) {
9202 unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy);
9203 if (SrcIt != MinBWs.
end()) {
9204 SrcBWSz = SrcIt->second.first;
9208 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
9209 if (BWSz == SrcBWSz) {
9210 VecOpcode = Instruction::BitCast;
9211 }
else if (BWSz < SrcBWSz) {
9212 VecOpcode = Instruction::Trunc;
9213 }
else if (It != MinBWs.
end()) {
9214 assert(BWSz > SrcBWSz &&
"Invalid cast!");
9215 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9216 }
else if (SrcIt != MinBWs.
end()) {
9217 assert(BWSz > SrcBWSz &&
"Invalid cast!");
9219 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9221 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.
end() &&
9222 !SrcIt->second.second) {
9223 VecOpcode = Instruction::UIToFP;
9226 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9234 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9236 auto *
VI = VL0->
getOpcode() == Opcode ? VL0 :
nullptr;
9240 VecOpcode == Opcode ? VI :
nullptr);
9242 return GetCostDiff(GetScalarCost, GetVectorCost);
9244 case Instruction::FCmp:
9245 case Instruction::ICmp:
9246 case Instruction::Select: {
9250 match(VL0, MatchCmp))
9256 auto GetScalarCost = [&](
unsigned Idx) {
9257 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9263 !
match(VI, MatchCmp)) ||
9264 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9270 Builder.getInt1Ty(), CurrentPred,
CostKind,
9277 E->getOpcode(), VecTy, MaskTy, VecPred,
CostKind, VL0);
9289 if (IntrinsicAndUse.second)
9292 VecCost = std::min(VecCost, IntrinsicCost);
9294 return VecCost + CommonCost;
9296 return GetCostDiff(GetScalarCost, GetVectorCost);
9298 case Instruction::FNeg:
9299 case Instruction::Add:
9300 case Instruction::FAdd:
9301 case Instruction::Sub:
9302 case Instruction::FSub:
9303 case Instruction::Mul:
9304 case Instruction::FMul:
9305 case Instruction::UDiv:
9306 case Instruction::SDiv:
9307 case Instruction::FDiv:
9308 case Instruction::URem:
9309 case Instruction::SRem:
9310 case Instruction::FRem:
9311 case Instruction::Shl:
9312 case Instruction::LShr:
9313 case Instruction::AShr:
9314 case Instruction::And:
9315 case Instruction::Or:
9316 case Instruction::Xor: {
9317 auto GetScalarCost = [&](
unsigned Idx) {
9318 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9319 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
9328 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
9332 Op2Info, std::nullopt,
nullptr, TLI) +
9335 return GetCostDiff(GetScalarCost, GetVectorCost);
9337 case Instruction::GetElementPtr: {
9338 return CommonCost + GetGEPCostDiff(VL, VL0);
9340 case Instruction::Load: {
9341 auto GetScalarCost = [&](
unsigned Idx) {
9342 auto *
VI = cast<LoadInst>(UniqueValues[
Idx]);
9344 VI->getAlign(),
VI->getPointerAddressSpace(),
9347 auto *LI0 = cast<LoadInst>(VL0);
9350 if (E->State == TreeEntry::Vectorize) {
9352 Instruction::Load, VecTy, LI0->getAlign(),
9354 }
else if (E->State == TreeEntry::StridedVectorize) {
9355 Align CommonAlignment =
9356 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9358 Instruction::Load, VecTy, LI0->getPointerOperand(),
9361 assert(E->State == TreeEntry::ScatterVectorize &&
"Unknown EntryState");
9362 Align CommonAlignment =
9363 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9365 Instruction::Load, VecTy, LI0->getPointerOperand(),
9368 return VecLdCost + CommonCost;
9374 if (E->State == TreeEntry::ScatterVectorize)
9380 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
9381 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9383 case Instruction::Store: {
9384 bool IsReorder = !E->ReorderIndices.empty();
9385 auto GetScalarCost = [=](
unsigned Idx) {
9386 auto *
VI = cast<StoreInst>(VL[
Idx]);
9389 VI->getAlign(),
VI->getPointerAddressSpace(),
9393 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9398 BaseSI->getPointerAddressSpace(),
CostKind,
9404 unsigned Idx = IsReorder ? E->ReorderIndices[
I] :
I;
9405 PointerOps[
Idx] = cast<StoreInst>(V)->getPointerOperand();
9408 return GetCostDiff(GetScalarCost, GetVectorCost) +
9409 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9411 case Instruction::Call: {
9412 auto GetScalarCost = [&](
unsigned Idx) {
9413 auto *CI = cast<CallInst>(UniqueValues[
Idx]);
9424 auto *CI = cast<CallInst>(VL0);
9428 It != MinBWs.
end() ? It->second.first : 0);
9430 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
9432 return GetCostDiff(GetScalarCost, GetVectorCost);
9434 case Instruction::ShuffleVector: {
9435 assert(E->isAltShuffle() &&
9440 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9441 "Invalid Shuffle Vector Operand");
9444 auto TryFindNodeWithEqualOperands = [=]() {
9445 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9448 if (
TE->isAltShuffle() &&
9449 ((
TE->getOpcode() == E->getOpcode() &&
9450 TE->getAltOpcode() == E->getAltOpcode()) ||
9451 (
TE->getOpcode() == E->getAltOpcode() &&
9452 TE->getAltOpcode() == E->getOpcode())) &&
9453 TE->hasEqualOperands(*E))
9458 auto GetScalarCost = [&](
unsigned Idx) {
9459 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
9460 assert(E->isOpcodeOrAlt(VI) &&
"Unexpected main/alternate opcode");
9470 if (TryFindNodeWithEqualOperands()) {
9472 dbgs() <<
"SLP: diamond match for alternate node found.\n";
9479 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy,
CostKind);
9481 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
9482 }
else if (
auto *CI0 = dyn_cast<CmpInst>(VL0)) {
9484 VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
9485 CI0->getPredicate(),
CostKind, VL0);
9486 VecCost += TTIRef.getCmpSelInstrCost(
9487 E->getOpcode(), VecTy, MaskTy,
9488 cast<CmpInst>(E->getAltOp())->getPredicate(),
CostKind,
9491 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
9494 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
9495 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
9497 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
9498 if (SrcIt != MinBWs.
end()) {
9499 SrcBWSz = SrcIt->second.first;
9503 if (BWSz <= SrcBWSz) {
9506 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
9510 <<
"SLP: alternate extension, which should be truncated.\n";
9516 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
9519 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
9523 E->buildAltOpShuffleMask(
9525 assert(E->isOpcodeOrAlt(
I) &&
"Unexpected main/alternate opcode");
9526 return I->getOpcode() == E->getAltOpcode();
9535 unsigned Opcode0 = E->getOpcode();
9536 unsigned Opcode1 = E->getAltOpcode();
9539 for (
unsigned Lane : seq<unsigned>(0, E->Scalars.size()))
9540 if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1)
9541 OpcodeMask.set(Lane);
9544 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9546 VecTy, Opcode0, Opcode1, OpcodeMask,
CostKind);
9547 return AltVecCost < VecCost ? AltVecCost : VecCost;
9552 return GetCostDiff(GetScalarCost, GetVectorCost);
9559bool BoUpSLP::isFullyVectorizableTinyTree(
bool ForReduction)
const {
9561 << VectorizableTree.size() <<
" is fully vectorizable .\n");
9563 auto &&AreVectorizableGathers = [
this](
const TreeEntry *
TE,
unsigned Limit) {
9565 return TE->State == TreeEntry::NeedToGather &&
9567 [
this](
Value *V) { return EphValues.contains(V); }) &&
9569 TE->Scalars.size() < Limit ||
9570 ((
TE->getOpcode() == Instruction::ExtractElement ||
9571 all_of(
TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
9573 (
TE->State == TreeEntry::NeedToGather &&
9574 TE->getOpcode() == Instruction::Load && !
TE->isAltShuffle()));
9578 if (VectorizableTree.size() == 1 &&
9579 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
9581 AreVectorizableGathers(VectorizableTree[0].
get(),
9582 VectorizableTree[0]->Scalars.size()) &&
9583 VectorizableTree[0]->getVectorFactor() > 2)))
9586 if (VectorizableTree.size() != 2)
9594 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
9595 AreVectorizableGathers(VectorizableTree[1].
get(),
9596 VectorizableTree[0]->Scalars.size()))
9600 if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
9601 (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9602 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
9603 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
9611 bool MustMatchOrInst) {
9615 Value *ZextLoad = Root;
9616 const APInt *ShAmtC;
9617 bool FoundOr =
false;
9618 while (!isa<ConstantExpr>(ZextLoad) &&
9621 ShAmtC->
urem(8) == 0))) {
9622 auto *BinOp = cast<BinaryOperator>(ZextLoad);
9623 ZextLoad = BinOp->getOperand(0);
9624 if (BinOp->getOpcode() == Instruction::Or)
9629 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
9636 Type *SrcTy = Load->getType();
9643 LLVM_DEBUG(
dbgs() <<
"SLP: Assume load combining for tree starting at "
9644 << *(cast<Instruction>(Root)) <<
"\n");
9653 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9654 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
9662 unsigned NumElts = Stores.
size();
9663 for (
Value *Scalar : Stores) {
9674 if (VectorizableTree.size() == 2 &&
9675 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
9676 VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9677 (VectorizableTree[1]->getVectorFactor() <= 2 ||
9678 !(
isSplat(VectorizableTree[1]->Scalars) ||
9686 constexpr int Limit = 4;
9688 !VectorizableTree.empty() &&
9689 all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
9690 return (TE->State == TreeEntry::NeedToGather &&
9691 TE->getOpcode() != Instruction::ExtractElement &&
9692 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
9693 TE->getOpcode() == Instruction::PHI;
9704 if (isFullyVectorizableTinyTree(ForReduction))
9709 bool IsAllowedSingleBVNode =
9710 VectorizableTree.size() > 1 ||
9711 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
9712 !VectorizableTree.front()->isAltShuffle() &&
9713 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
9714 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
9716 if (
any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
9717 return TE->State == TreeEntry::NeedToGather &&
9719 return isa<ExtractElementInst, UndefValue>(V) ||
9720 (IsAllowedSingleBVNode &&
9721 !V->hasNUsesOrMore(UsesLimit) &&
9722 any_of(V->users(), IsaPred<InsertElementInst>));
9727 assert(VectorizableTree.empty()
9728 ? ExternalUses.empty()
9729 :
true &&
"We shouldn't have any external users");
9741 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
9754 for (
const auto &TEPtr : VectorizableTree) {
9755 if (TEPtr->State != TreeEntry::Vectorize)
9757 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
9763 auto *NodeA = DT->
getNode(
A->getParent());
9764 auto *NodeB = DT->
getNode(
B->getParent());
9765 assert(NodeA &&
"Should only process reachable instructions");
9766 assert(NodeB &&
"Should only process reachable instructions");
9767 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
9768 "Different nodes should have different DFS numbers");
9770 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
9771 return B->comesBefore(
A);
9781 LiveValues.
erase(PrevInst);
9782 for (
auto &J : PrevInst->
operands()) {
9783 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
9784 LiveValues.
insert(cast<Instruction>(&*J));
9788 dbgs() <<
"SLP: #LV: " << LiveValues.
size();
9789 for (
auto *
X : LiveValues)
9790 dbgs() <<
" " <<
X->getName();
9791 dbgs() <<
", Looking at ";
9796 unsigned NumCalls = 0;
9800 while (InstIt != PrevInstIt) {
9802 PrevInstIt = Inst->getParent()->rbegin();
9807 if (
auto *II = dyn_cast<IntrinsicInst>(
I)) {
9808 if (II->isAssumeLikeIntrinsic())
9812 for (
auto &ArgOp : II->args())
9814 if (
auto *FPMO = dyn_cast<FPMathOperator>(II))
9815 FMF = FPMO->getFastMathFlags();
9822 if (IntrCost < CallCost)
9829 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
9830 &*PrevInstIt != PrevInst)
9838 for (
auto *II : LiveValues) {
9839 auto *ScalarTy = II->getType();
9840 if (
auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
9841 ScalarTy = VectorTy->getElementType();
9859 const auto *I1 = IE1;
9860 const auto *I2 = IE2;
9872 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
9874 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
9875 if (I2 && ((I2 == IE2 || I2->
hasOneUse())) &&
9877 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
9878 }
while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
9886 template <
typename U>
9887 static std::enable_if_t<std::is_same_v<Value *, U>,
Value *>
get(
Value *V) {
9890 template <
typename U>
9891 static std::enable_if_t<!std::is_same_v<Value *, U>,
U>
get(
Value *) {
9909template <
typename T>
9915 assert(!ShuffleMask.empty() &&
"Empty list of shuffles for inserts.");
9917 auto VMIt = std::next(ShuffleMask.begin());
9920 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
9922 if (!IsBaseUndef.
all()) {
9924 std::pair<T *, bool> Res =
9925 ResizeAction(ShuffleMask.begin()->first, Mask,
false);
9927 for (
unsigned Idx = 0, VF = Mask.size();
Idx < VF; ++
Idx) {
9931 Mask[
Idx] = (Res.second ?
Idx : Mask[
Idx]) + VF;
9933 auto *V = ValueSelect::get<T *>(
Base);
9935 assert((!V || GetVF(V) == Mask.size()) &&
9936 "Expected base vector of VF number of elements.");
9937 Prev = Action(Mask, {
nullptr, Res.first});
9938 }
else if (ShuffleMask.size() == 1) {
9941 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
9947 Prev = Action(Mask, {ShuffleMask.begin()->first});
9951 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
9952 unsigned Vec2VF = GetVF(VMIt->first);
9953 if (Vec1VF == Vec2VF) {
9957 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
9960 Mask[
I] = SecMask[
I] + Vec1VF;
9963 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
9966 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
9968 std::pair<T *, bool> Res2 =
9969 ResizeAction(VMIt->first, VMIt->second,
false);
9971 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
9978 Mask[
I] = (Res2.second ?
I : SecMask[
I]) + VF;
9981 Prev = Action(Mask, {Res1.first, Res2.first});
9983 VMIt = std::next(VMIt);
9985 bool IsBaseNotUndef = !IsBaseUndef.
all();
9986 (void)IsBaseNotUndef;
9988 for (
auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
9990 std::pair<T *, bool> Res =
9991 ResizeAction(VMIt->first, VMIt->second,
false);
9993 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
9996 "Multiple uses of scalars.");
9997 Mask[
I] = (Res.second ?
I : SecMask[
I]) + VF;
10002 Prev = Action(Mask, {Prev, Res.first});
10010 << VectorizableTree.size() <<
".\n");
10012 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
10015 for (
unsigned I = 0, E = VectorizableTree.size();
I < E; ++
I) {
10016 TreeEntry &TE = *VectorizableTree[
I];
10017 if (TE.State == TreeEntry::NeedToGather) {
10018 if (
const TreeEntry *E = getTreeEntry(TE.getMainOp());
10019 E && E->getVectorFactor() == TE.getVectorFactor() &&
10020 E->isSame(TE.Scalars)) {
10025 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10034 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10044 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
10045 for (ExternalUser &EU : ExternalUses) {
10047 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
10048 !ExtractCostCalculated.
insert(EU.Scalar).second)
10054 if (EphValues.
count(EU.User))
10058 if (isa<FixedVectorType>(EU.Scalar->getType()))
10063 if (
auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
10064 if (
auto *FTy = dyn_cast<FixedVectorType>(VU->
getType())) {
10065 if (!UsedInserts.
insert(VU).second)
10069 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
10072 [
this, VU](
const std::pair<Value *, const TreeEntry *> &Pair) {
10074 VU, cast<InsertElementInst>(Pair.first),
10076 Value *Op0 = II->getOperand(0);
10077 if (getTreeEntry(II) && !getTreeEntry(Op0))
10083 if (It == FirstUsers.
end()) {
10090 while (
auto *IEBase = dyn_cast<InsertElementInst>(
Base)) {
10091 if (IEBase != EU.User &&
10092 (!IEBase->hasOneUse() ||
10096 if (
const TreeEntry *E = getTreeEntry(IEBase)) {
10099 IEBase = cast<InsertElementInst>(
Base);
10102 "InsertElementInstruction used already.");
10104 Base = IEBase->getOperand(0);
10105 }
while (E == getTreeEntry(
Base));
10108 Base = cast<InsertElementInst>(
Base)->getOperand(0);
10112 VecId = FirstUsers.
size() - 1;
10113 auto It = MinBWs.
find(ScalarTE);
10114 if (It != MinBWs.
end() &&
10116 .
insert(std::make_pair(ScalarTE, FTy->getElementType()))
10118 unsigned BWSz = It->second.first;
10119 unsigned DstBWSz =
DL->getTypeSizeInBits(FTy->getElementType());
10120 unsigned VecOpcode;
10121 if (DstBWSz < BWSz)
10122 VecOpcode = Instruction::Trunc;
10125 It->second.second ? Instruction::SExt : Instruction::ZExt;
10131 FTy->getNumElements()),
10134 <<
" for extending externally used vector with "
10135 "non-equal minimum bitwidth.\n");
10141 VecId = std::distance(FirstUsers.
begin(), It);
10143 int InIdx = *InsertIdx;
10147 Mask[InIdx] = EU.Lane;
10148 DemandedElts[VecId].setBit(InIdx);
10156 if (
auto *
GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10157 if (!ValueToExtUses) {
10158 ValueToExtUses.emplace();
10160 ValueToExtUses->try_emplace(
P.value().Scalar,
P.index());
10166 if (!getTreeEntry(V))
10168 auto It = ValueToExtUses->find(V);
10169 if (It != ValueToExtUses->end()) {
10171 ExternalUses[It->second].User = nullptr;
10176 if (CanBeUsedAsGEP) {
10178 ExternalUsesAsGEPs.
insert(EU.Scalar);
10187 auto It = MinBWs.
find(getTreeEntry(EU.Scalar));
10188 if (It != MinBWs.
end()) {
10191 It->second.second ? Instruction::SExt : Instruction::ZExt;
10201 if (!VectorizedVals.
empty()) {
10202 const TreeEntry &Root = *VectorizableTree.front().get();
10203 auto BWIt = MinBWs.find(&Root);
10204 if (BWIt != MinBWs.end()) {
10205 Type *DstTy = Root.Scalars.front()->getType();
10206 unsigned OriginalSz =
DL->getTypeSizeInBits(DstTy);
10208 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10209 if (OriginalSz != SrcSz) {
10210 unsigned Opcode = Instruction::Trunc;
10211 if (OriginalSz > SrcSz)
10212 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10222 Cost += SpillCost + ExtractCost;
10226 unsigned VF =
Mask.size();
10227 unsigned VecVF =
TE->getVectorFactor();
10229 (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); }) ||
10232 std::copy(
Mask.begin(), std::next(
Mask.begin(), std::min(VF, VecVF)),
10238 dbgs() <<
"SLP: Adding cost " <<
C
10239 <<
" for final shuffle of insertelement external users.\n";
10240 TE->dump();
dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10242 return std::make_pair(TE,
true);
10244 return std::make_pair(TE,
false);
10247 for (
int I = 0, E = FirstUsers.size();
I < E; ++
I) {
10248 Value *
Base = cast<Instruction>(FirstUsers[
I].first)->getOperand(0);
10249 auto Vector = ShuffleMasks[
I].takeVector();
10253 assert((TEs.size() == 1 || TEs.size() == 2) &&
10254 "Expected exactly 1 or 2 tree entries.");
10255 if (TEs.size() == 1) {
10257 VF = TEs.front()->getVectorFactor();
10263 (
Data.index() < VF &&
10264 static_cast<int>(
Data.index()) ==
Data.value());
10269 <<
" for final shuffle of insertelement "
10270 "external users.\n";
10271 TEs.front()->
dump();
10272 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10278 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10279 VF = TEs.front()->getVectorFactor();
10288 <<
" for final shuffle of vector node and external "
10289 "insertelement users.\n";
10290 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10291 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10297 (void)performExtractsShuffleAction<const TreeEntry>(
10299 [](
const TreeEntry *E) {
return E->getVectorFactor(); }, ResizeToVF,
10300 EstimateShufflesCost);
10302 cast<FixedVectorType>(FirstUsers[
I].first->getType()), DemandedElts[
I],
10304 Cost -= InsertCost;
10308 if (ReductionBitWidth != 0) {
10309 assert(UserIgnoreList &&
"Expected reduction tree.");
10310 const TreeEntry &E = *VectorizableTree.front().get();
10311 auto It = MinBWs.find(&E);
10312 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10313 unsigned SrcSize = It->second.first;
10314 unsigned DstSize = ReductionBitWidth;
10315 unsigned Opcode = Instruction::Trunc;
10316 if (SrcSize < DstSize)
10317 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10324 switch (E.getOpcode()) {
10325 case Instruction::SExt:
10326 case Instruction::ZExt:
10327 case Instruction::Trunc: {
10328 const TreeEntry *OpTE = getOperandEntry(&E, 0);
10329 CCH = getCastContextHint(*OpTE);
10339 <<
" for final resize for reduction from " << SrcVecTy
10340 <<
" to " << DstVecTy <<
"\n";
10341 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
10349 OS <<
"SLP: Spill Cost = " << SpillCost <<
".\n"
10350 <<
"SLP: Extract Cost = " << ExtractCost <<
".\n"
10351 <<
"SLP: Total Cost = " <<
Cost <<
".\n";
10355 ViewGraph(
this,
"SLP" +
F->getName(),
false, Str);
10366std::optional<TTI::ShuffleKind>
10367BoUpSLP::tryToGatherSingleRegisterExtractElements(
10373 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
10374 auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
10376 if (isa<UndefValue>(VL[
I]))
10380 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
10381 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
10390 ExtractMask.reset(*
Idx);
10395 VectorOpToIdx[EI->getVectorOperand()].push_back(
I);
10399 for (
const auto &
Data : VectorOpToIdx)
10400 VFToVector[cast<FixedVectorType>(
Data.first->getType())->getNumElements()]
10401 .push_back(
Data.first);
10402 for (
auto &
Data : VFToVector) {
10404 return VectorOpToIdx.find(V1)->second.size() >
10405 VectorOpToIdx.find(V2)->second.size();
10410 const int UndefSz = UndefVectorExtracts.
size();
10411 unsigned SingleMax = 0;
10412 Value *SingleVec =
nullptr;
10413 unsigned PairMax = 0;
10414 std::pair<Value *, Value *> PairVec(
nullptr,
nullptr);
10415 for (
auto &
Data : VFToVector) {
10417 if (SingleMax < VectorOpToIdx[V1].
size() + UndefSz) {
10418 SingleMax = VectorOpToIdx[V1].size() + UndefSz;
10422 if (
Data.second.size() > 1)
10423 V2 = *std::next(
Data.second.begin());
10424 if (V2 && PairMax < VectorOpToIdx[V1].
size() + VectorOpToIdx[V2].
size() +
10426 PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[
V2].size() + UndefSz;
10427 PairVec = std::make_pair(V1, V2);
10430 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10431 return std::nullopt;
10437 if (SingleMax >= PairMax && SingleMax) {
10438 for (
int Idx : VectorOpToIdx[SingleVec])
10441 for (
Value *V : {PairVec.first, PairVec.second})
10442 for (
int Idx : VectorOpToIdx[V])
10446 for (
int Idx : UndefVectorExtracts)
10450 std::optional<TTI::ShuffleKind> Res =
10456 return std::nullopt;
10460 for (
int I = 0, E = GatheredExtracts.size();
I < E; ++
I) {
10461 if (Mask[
I] ==
PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[
I]) &&
10462 isa<UndefValue>(GatheredExtracts[
I])) {
10466 auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
10467 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
10468 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
10483 unsigned NumParts)
const {
10484 assert(NumParts > 0 &&
"NumParts expected be greater than or equal to 1.");
10487 unsigned SliceSize = VL.
size() / NumParts;
10488 for (
unsigned Part = 0; Part < NumParts; ++Part) {
10494 std::optional<TTI::ShuffleKind> Res =
10495 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
10496 ShufflesRes[Part] = Res;
10497 copy(SubMask, std::next(
Mask.begin(), Part * SliceSize));
10499 if (
none_of(ShufflesRes, [](
const std::optional<TTI::ShuffleKind> &Res) {
10500 return Res.has_value();
10502 ShufflesRes.clear();
10503 return ShufflesRes;
10506std::optional<TargetTransformInfo::ShuffleKind>
10507BoUpSLP::isGatherShuffledSingleRegisterEntry(
10513 const EdgeInfo &TEUseEI =
TE->UserTreeIndices.front();
10514 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
10518 if (
auto *
PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
10519 TEInsertBlock =
PHI->getIncomingBlock(TEUseEI.EdgeIdx);
10522 TEInsertBlock = TEInsertPt->
getParent();
10525 return std::nullopt;
10526 auto *NodeUI = DT->
getNode(TEInsertBlock);
10527 assert(NodeUI &&
"Should only process reachable instructions");
10529 auto CheckOrdering = [&](
const Instruction *InsertPt) {
10543 auto *NodeEUI = DT->
getNode(InsertBlock);
10546 assert((NodeUI == NodeEUI) ==
10547 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10548 "Different nodes should have different DFS numbers");
10550 if (TEInsertPt->
getParent() != InsertBlock &&
10553 if (TEInsertPt->
getParent() == InsertBlock &&
10567 for (
Value *V : VL) {
10572 for (
const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10576 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
10577 "Must contain at least single gathered value.");
10578 assert(TEPtr->UserTreeIndices.size() == 1 &&
10579 "Expected only single user of a gather node.");
10580 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
10582 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
10585 : &getLastInstructionInBundle(UseEI.UserTE);
10586 if (TEInsertPt == InsertPt) {
10590 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
10594 if (TEUseEI.UserTE != UseEI.UserTE &&
10595 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
10601 if ((TEInsertBlock != InsertPt->
getParent() ||
10602 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
10603 !CheckOrdering(InsertPt))
10607 if (
const TreeEntry *VTE = getTreeEntry(V)) {
10609 if (VTE->State != TreeEntry::Vectorize) {
10610 auto It = MultiNodeScalars.
find(V);
10611 if (It == MultiNodeScalars.
end())
10613 VTE = *It->getSecond().begin();
10615 auto *MIt =
find_if(It->getSecond(), [](
const TreeEntry *MTE) {
10616 return MTE->State == TreeEntry::Vectorize;
10618 if (MIt == It->getSecond().end())
10623 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
10624 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
10628 if (VToTEs.
empty())
10630 if (UsedTEs.
empty()) {
10644 if (!VToTEs.
empty()) {
10650 VToTEs = SavedVToTEs;
10659 if (UsedTEs.
size() == 2)
10661 UsedTEs.push_back(SavedVToTEs);
10668 if (UsedTEs.
empty()) {
10670 return std::nullopt;
10674 if (UsedTEs.
size() == 1) {
10677 UsedTEs.front().
end());
10678 sort(FirstEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
10679 return TE1->Idx < TE2->Idx;
10682 auto *It =
find_if(FirstEntries, [=](
const TreeEntry *EntryPtr) {
10683 return EntryPtr->isSame(VL) || EntryPtr->isSame(
TE->Scalars);
10685 if (It != FirstEntries.end() &&
10686 ((*It)->getVectorFactor() == VL.size() ||
10687 ((*It)->getVectorFactor() ==
TE->Scalars.size() &&
10688 TE->ReuseShuffleIndices.size() == VL.size() &&
10689 (*It)->isSame(
TE->Scalars)))) {
10690 Entries.push_back(*It);
10691 if ((*It)->getVectorFactor() == VL.size()) {
10692 std::iota(std::next(
Mask.begin(), Part * VL.size()),
10693 std::next(
Mask.begin(), (Part + 1) * VL.size()), 0);
10699 for (
int I = 0, Sz = VL.size();
I < Sz; ++
I)
10700 if (isa<PoisonValue>(VL[
I]))
10706 Entries.push_back(FirstEntries.front());
10709 assert(UsedTEs.
size() == 2 &&
"Expected at max 2 permuted entries.");
10712 for (
const TreeEntry *TE : UsedTEs.front()) {
10713 unsigned VF =
TE->getVectorFactor();
10714 auto It = VFToTE.
find(VF);
10715 if (It != VFToTE.
end()) {
10716 if (It->second->Idx >
TE->Idx)
10717 It->getSecond() =
TE;
10724 UsedTEs.back().
end());
10725 sort(SecondEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
10726 return TE1->Idx < TE2->Idx;
10728 for (
const TreeEntry *TE : SecondEntries) {
10729 auto It = VFToTE.
find(
TE->getVectorFactor());
10730 if (It != VFToTE.
end()) {
10732 Entries.push_back(It->second);
10733 Entries.push_back(TE);
10739 if (Entries.empty()) {
10741 UsedTEs.front(), [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
10742 return TE1->Idx < TE2->Idx;
10744 Entries.push_back(SecondEntries.front());
10745 VF = std::max(Entries.front()->getVectorFactor(),
10746 Entries.back()->getVectorFactor());
10750 bool IsSplatOrUndefs =
isSplat(VL) ||
all_of(VL, IsaPred<UndefValue>);
10753 auto AreCompatiblePHIs = [&](
Value *
V,
Value *V1) {
10754 auto *
PHI = cast<PHINode>(V);
10755 auto *PHI1 = cast<PHINode>(V1);
10760 for (
int I = 0, E =
PHI->getNumIncomingValues();
I < E; ++
I) {
10762 Value *In1 = PHI1->getIncomingValue(
I);
10767 if (cast<Instruction>(In)->
getParent() !=
10777 auto MightBeIgnored = [=](
Value *
V) {
10778 auto *
I = dyn_cast<Instruction>(V);
10779 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.
count(
I) &&
10781 !areAllUsersVectorized(
I, UserIgnoreList) &&
isSimple(
I);
10786 auto NeighborMightBeIgnored = [&](
Value *
V,
int Idx) {
10788 bool UsedInSameVTE =
false;
10789 auto It = UsedValuesEntry.
find(V1);
10790 if (It != UsedValuesEntry.
end())
10791 UsedInSameVTE = It->second == UsedValuesEntry.
find(V)->second;
10792 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
10794 cast<Instruction>(V)->getParent() ==
10795 cast<Instruction>(V1)->getParent() &&
10796 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
10801 for (
int I = 0, E = VL.size();
I < E; ++
I) {
10803 auto It = UsedValuesEntry.
find(V);
10804 if (It == UsedValuesEntry.
end())
10810 ((
I > 0 && NeighborMightBeIgnored(V,
I - 1)) ||
10811 (
I != E - 1 && NeighborMightBeIgnored(V,
I + 1)))))
10813 unsigned Idx = It->second;
10820 for (
unsigned I = 0, Sz = Entries.size();
I < Sz; ++
I) {
10821 if (!UsedIdxs.test(
I))
10827 for (std::pair<unsigned, int> &Pair : EntryLanes)
10828 if (Pair.first ==
I)
10829 Pair.first = TempEntries.
size();
10832 Entries.swap(TempEntries);
10833 if (EntryLanes.size() == Entries.size() &&
10835 .
slice(Part * VL.size(),
10836 std::min<int>(VL.size(),
TE->Scalars.size())))) {
10842 return std::nullopt;
10845 bool IsIdentity = Entries.size() == 1;
10848 for (
const std::pair<unsigned, int> &Pair : EntryLanes) {
10849 unsigned Idx = Part * VL.size() + Pair.second;
10852 (ForOrder ? std::distance(
10853 Entries[Pair.first]->Scalars.begin(),
10854 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
10855 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
10856 IsIdentity &=
Mask[
Idx] == Pair.second;
10858 switch (Entries.size()) {
10860 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
10864 if (EntryLanes.size() > 2 || VL.size() <= 2)
10872 std::fill(std::next(
Mask.begin(), Part * VL.size()),
10874 return std::nullopt;
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
    unsigned NumParts, bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get())
    return {};
  // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
  if (TE->isNonPowOf2Vec())
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert(TE->UserTreeIndices.size() == 1 &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  unsigned SliceSize = VL.size() / NumParts;
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  for (unsigned Part = 0; Part < NumParts; ++Part) {
    ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(SubRes);
    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      Res.push_back(TTI::SK_PermuteSingleSrc);
      return Res;
    }
  }
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
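// NOTE (explanatory sketch, not part of the original source):
// isGatherShuffledEntry() splits the scalars into NumParts register-sized
// slices and analyzes each slice independently; e.g. for VL.size() == 8 on a
// target with 128-bit registers and i32 elements, NumParts == 2 and
// SliceSize == 4, so lanes [0,4) and [4,8) may each pick their own source
// entries. The early return above recognizes the "perfect diamond" case,
// where a single existing entry already produces exactly VL and only an
// identity mask (with poison lanes cleared) is required.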
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
  bool DuplicateNonConst = false;
  // Find the cost of inserting/extracting values from the vector.
  // Check if the same elements are inserted several times and count them as
  // shuffle candidates.
  APInt ShuffledElements = APInt::getZero(VL.size());
  DenseMap<Value *, unsigned> UniqueElements;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy) {
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
                                    TTI::CastContextHint::None, CostKind);
      V = nullptr;
    }
    if (!ForPoisonSrc)
      Cost +=
          TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                  I, Constant::getNullValue(VecTy), V);
  };
  SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    // No need to shuffle duplicates for constants.
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
      ShuffledElements.setBit(I);
      ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
      continue;
    }
    auto Res = UniqueElements.try_emplace(V, I);
    if (Res.second) {
      EstimateInsertCost(I, V);
      ShuffleMask[I] = I;
      continue;
    }
    DuplicateNonConst = true;
    ShuffledElements.setBit(I);
    ShuffleMask[I] = Res.first->second;
  }
  if (DuplicateNonConst)
    Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                VecTy, ShuffleMask);
  return Cost;
}
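// NOTE (explanatory sketch, not part of the original source): the gather
// cost computed above is roughly
//   sum(insertelement cost per unique non-constant scalar)
//   + (DuplicateNonConst ? cost of one SK_PermuteSingleSrc shuffle : 0)
// e.g. for VL = {x, y, x, y} only two inserts are counted and the repeated
// lanes become ShuffleMask = {0, 1, 0, 1}, paid for by a single permute.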
  VLOperands Ops(VL, R);
  // Reorder the operands in place.
  Ops.reorder();
  Left = Ops.getVL(0);
  Right = Ops.getVL(1);
}
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto &Res = EntryToLastInstruction.FindAndConstruct(E);
  if (Res.second)
    return *Res.second;
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions with
  // constant indices).
  auto *Front = E->getMainOp();
  auto *BB = Front->getParent();
  assert(all_of(E->Scalars, [=](Value *V) -> bool {
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(V))
      return true;
    auto *I = cast<Instruction>(V);
    return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
           isVectorLikeInstWithConstOps(I);
  }));

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(LastInst) &&
               isVectorLikeInstWithConstOps(I))) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    BB = LastInst->getParent();
    return LastInst;
  };

  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(FirstInst) &&
               isVectorLikeInstWithConstOps(I))) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  // Set the insert point to the beginning of the basic block if the entry
  // should not be scheduled.
  if (doesNotNeedToSchedule(E->Scalars) ||
      (E->State != TreeEntry::NeedToGather &&
       all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
    if ((E->getOpcode() == Instruction::GetElementPtr &&
         any_of(E->Scalars,
                [](Value *V) {
                  return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
                })) ||
        all_of(E->Scalars,
               [](Value *V) {
                 return !isVectorLikeInstWithConstOps(V) &&
                        isUsedOutsideBlock(V);
               }) ||
        (E->State == TreeEntry::NeedToGather && E->Idx == 0 &&
         all_of(E->Scalars, [](Value *V) {
           return isa<ExtractElementInst, UndefValue>(V) ||
                  areAllOperandsNonInsts(V);
         })))
      Res.second = FindLastInst();
    else
      Res.second = FindFirstInst();
    return *Res.second;
  }

  // Find the last instruction. The common case should be that BB has been
  // scheduled, and the last instruction is VL.back(). So we start with
  // VL.back() and iterate over schedule data until we reach the end of the
  // bundle. The end of the bundle is marked by null ScheduleData.
  if (BlocksSchedules.count(BB)) {
    Value *V = E->isOneOf(E->Scalars.back());
    if (doesNotNeedToBeScheduled(V))
      V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
    if (Bundle && Bundle->isPartOfBundle())
      for (; Bundle; Bundle = Bundle->NextInBundle)
        if (Bundle->OpValue == Bundle->Inst)
          Res.second = Bundle->Inst;
  }

  // LastInst can still be null at this point if there is either no entry
  // for BB in BlocksSchedules or there is no ScheduleData available for
  // VL.back(), e.g. if buildTree_rec aborted early.
  if (!Res.second)
    Res.second = FindLastInst();
  assert(Res.second && "Failed to find last instruction in bundle");
  return *Res.second;
}
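// NOTE (explanatory sketch, not part of the original source): the lambdas
// above order instructions across basic blocks via dominator-tree DFS-in
// numbers rather than repeated full dominance queries: within one block
// comesBefore() decides, and across blocks a larger getDFSNumIn() is
// treated as "later" for FindLastInst (smaller for FindFirstInst). The
// asserts document why this is a safe proxy here: distinct reachable nodes
// always carry distinct DFS numbers.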
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (IsPHI || (E->State != TreeEntry::NeedToGather &&
                doesNotNeedToSchedule(E->Scalars))) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle. Set
    // the debug location to Front.
    Builder.SetInsertPoint(
        LastInst->getParent(),
        LastInst->getNextNonDebugInstruction()->getIterator());
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
  // List of instructions/lanes from current block and/or the blocks which are
  // part of the current loop body (if we vectorize in the loop body).
  SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           getTreeEntry(Inst) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(VL[I], I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      assert(Scalar->getType()->isIntegerTy() && Ty->isIntegerTy() &&
             "Expected integer types only.");
      Scalar = Builder.CreateIntCast(
          Scalar, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
    }

    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
    auto *InsElt = dyn_cast<InsertElementInst>(Vec);
    if (!InsElt)
      return Vec;
    GatherShuffleExtractSeq.insert(InsElt);
    CSEBlocks.insert(InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(V)) {
      if (TreeEntry *Entry = getTreeEntry(V)) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (Scalar != V) {
          if (auto *SI = dyn_cast<Instruction>(Scalar))
            UserOp = SI;
        } else {
          UserOp = InsElt;
        }
        if (UserOp) {
          unsigned FoundLane = Entry->findLaneForValue(V);
          ExternalUses.emplace_back(V, UserOp, FoundLane);
        }
      }
    }
    return Vec;
  };

  auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
  Value *Vec = Root ? Root : PoisonValue::get(VecTy);
  SmallVector<int> NonConsts;
  // Insert constant values at first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(I))
      continue;
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    if (Root) {
      if (!isa<UndefValue>(VL[I])) {
        NonConsts.push_back(I);
        continue;
      }
      if (isa<PoisonValue>(VL[I]))
        continue;
      if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
        if (SV->getMaskValue(I) == PoisonMaskElem)
          continue;
      }
    }
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions, which are/may be part of the loop, at the end to make
  // it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);

  return Vec;
}
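// NOTE (explanatory sketch, not part of the original source): gather()
// emits its insertelement chain in three phases: constants first,
// non-constants second, and loop/region-dependent scalars last, so the
// constant prefix can later be hoisted or CSE'd. Hypothetical IR:
//   %v0 = insertelement <4 x i32> poison, i32 7, i32 1     ; constant lane
//   %v1 = insertelement <4 x i32> %v0, i32 %x, i32 0       ; non-constant
//   %v2 = insertelement <4 x i32> %v1, i32 %loopval, i32 2 ; postponed lane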
/// Merges shuffle masks and emits the final shuffle instruction, if required.
/// It supports shuffling of 2 input vectors and implements lazy shuffle
/// emission: the actual shuffle instruction is generated only when it is
/// really needed.
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  /// Combined mask for all applied operands and masks.
  SmallVector<int> CommonMask;
  /// List of operands for the shuffle.
  SmallVector<Value *, 2> InVectors;
  Type *ScalarTy = nullptr;
  IRBuilderBase &Builder;
  BoUpSLP &R;

  /// Lightweight wrapper around the IR builder which also registers newly
  /// created shuffles in the CSE and gather-sequence bookkeeping.
  class ShuffleIRBuilder {
    IRBuilderBase &Builder;
    /// Holds all of the instructions that we gathered.
    SetVector<Instruction *> &GatherShuffleExtractSeq;
    /// A list of blocks that we are going to CSE.
    DenseSet<BasicBlock *> &CSEBlocks;
    /// Data layout.
    const DataLayout &DL;

  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates shufflevector for the 2 operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (V1->getType() != V2->getType()) {
          if (cast<VectorType>(V2->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            V2 = Builder.CreateIntCast(
                V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
          else
            V1 = Builder.CreateIntCast(
                V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
        }
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates permutation of the single vector operand with the given mask,
    /// if it is not an identity mask.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      unsigned VF = Mask.size();
      unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
        return V1;
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(FixedVectorType::get(Ty, VF));
    }
    /// Resizes 2 input vectors to match their sizes, if they are not equal
    /// yet. The smallest vector is resized to the size of the larger one.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
  };

  /// Smart shuffle instruction emission, walks through shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder);
  }

  /// Transforms mask \p CommonMask per given \p Mask to make a proper set
  /// after shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx;
  }

  /// Casts value \p V to the vector element type \p ScalarTy, if needed.
  Value *castToScalarTyElem(Value *V) {
    auto *VecTy = cast<VectorType>(V->getType());
    if (VecTy->getElementType() == ScalarTy)
      return V;
    return Builder.CreateIntCast(
        V, VectorType::get(ScalarTy, VecTy->getElementCount()),
        !isKnownNonNegative(V, SimplifyQuery(*R.DL)));
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : ScalarTy(ScalarTy), Builder(Builder), R(R) {}
  /// Adjusts extractelements after reusing them.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      int Idx = Mask[I];
      if (Idx == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
      VecBase = EI->getVectorOperand();
      if (const TreeEntry *TE = R.getTreeEntry(VecBase))
        VecBase = TE->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If the only one use is vectorized - can delete the extractelement
      // itself.
      if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            const TreeEntry *UTE = R.getTreeEntry(U);
            return !UTE || R.MultiNodeScalars.contains(U) ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return any_of(TE->UserTreeIndices,
                                            [&](const EdgeInfo &Edge) {
                                              return Edge.UserTE == UTE;
                                            }) &&
                                     is_contained(TE->Scalars, EI);
                            }) != 1;
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      VecBase = castToScalarTyElem(VecBase);
      return VecBase;
    }
    UseVecBaseAsInput = true;
    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
      for (auto [I, Idx] : enumerate(Mask))
        if (Idx != PoisonMaskElem)
          Idx = I;
    };
    // Perform multi-register vector shuffle, joining the parts into a single
    // virtual long vector.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = E->Scalars.size() / NumParts;
    for (unsigned Part = 0; Part < NumParts; ++Part) {
      ArrayRef<Value *> VL =
          ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
      MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      int PrevSize = 0;
      for (const auto [I, V] : enumerate(VL)) {
        if (SubMask[I] == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (const TreeEntry *TE = R.getTreeEntry(VecOp))
          VecOp = TE->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        const int Size =
            cast<FixedVectorType>(VecOp->getType())->getNumElements();
        assert((PrevSize == Size || PrevSize == 0) &&
               "Expected vectors of the same size.");
        PrevSize = Size;
        VecOp = castToScalarTyElem(VecOp);
        Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert((Part == 0 ||
                all_of(seq<unsigned>(0, Part),
                       [&](unsigned P) {
                         ArrayRef<int> SubMask =
                             Mask.slice(P * SliceSize, SliceSize);
                         return all_of(SubMask, [](int Idx) {
                           return Idx == PoisonMaskElem;
                         });
                       })) &&
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements();
        if (Vec->getType() != SubVec->getType()) {
          unsigned SubVecVF =
              cast<FixedVectorType>(SubVec->getType())->getNumElements();
          VF = std::max(VF, SubVecVF);
        }
        // Adjust SubMask.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += VF;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    copy(VecMask, Mask.begin());
    return Vec;
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all deps are ready.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the
    // process to keep correct order. Emit a stub value for now.
    auto *ResVecTy = FixedVectorType::get(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        ResVecTy,
        PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
        MaybeAlign());
  }
  /// Adds 2 input vectors (in form of tree entries) and the mask for their
  /// shuffling.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    add(E1.VectorizedValue, E2.VectorizedValue, Mask);
  }
  /// Adds single input vector (in form of tree entry) and the mask for its
  /// shuffling.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    add(E1.VectorizedValue, Mask);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + Sz;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      if (!isa<FixedVectorType>(V1->getType())) {
        V1 = createShuffle(V1, nullptr, CommonMask);
        CommonMask.assign(Mask.size(), PoisonMaskElem);
        transformMaskAfterShuffle(CommonMask, Mask);
      }
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType() ||
          !isa<FixedVectorType>(V1->getType())) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] =
                V->getType() != V1->getType()
                    ? Idx + Sz
                    : Mask[Idx] + cast<FixedVectorType>(V1->getType())
                                      ->getNumElements();
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // Check if the second vector is required, if the used elements are
      // already used from the first one.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
          InVectors.push_back(V1);
          break;
        }
    }
    int VF = CommonMask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy);
  }
  Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
  /// Finalize emission of the shuffles.
  /// \param Action the action (if any) to be performed before final applying
  /// of the \p ExtMask mask.
  Value *
  finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask);
      InVectors.front() = Vec;
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                 bool PostponedPHIs) {
  ValueList &VL = E->getOperand(NodeIdx);
  const unsigned VF = VL.size();
  InstructionsState S = getSameOpcode(VL, *TLI);
  // Special processing for GEPs bundle, which may include non-gep values.
  if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    if (It != VL.end())
      S = getSameOpcode(*It, *TLI);
  }
  if (S.getOpcode()) {
    auto CheckSameVE = [&](const TreeEntry *VE) {
      return VE->isSame(VL) &&
             (any_of(VE->UserTreeIndices,
                     [E, NodeIdx](const EdgeInfo &EI) {
                       return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
                     }) ||
              any_of(VectorizableTree,
                     [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
                       return TE->isOperandGatherNode({E, NodeIdx}) &&
                              VE->isSame(TE->Scalars);
                     }));
    };
    TreeEntry *VE = getTreeEntry(S.OpValue);
    bool IsSameVE = VE && CheckSameVE(VE);
    if (!IsSameVE) {
      auto It = MultiNodeScalars.find(S.OpValue);
      if (It != MultiNodeScalars.end()) {
        auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
          return TE != VE && CheckSameVE(TE);
        });
        if (I != It->getSecond().end()) {
          VE = *I;
          IsSameVE = true;
        }
      }
    }
    if (IsSameVE) {
      auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
        ShuffleInstructionBuilder ShuffleBuilder(
            cast<VectorType>(V->getType())->getElementType(), Builder, *this);
        ShuffleBuilder.add(V, Mask);
        return ShuffleBuilder.finalize(std::nullopt);
      };
      Value *V = vectorizeTree(VE, PostponedPHIs);
      if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
        if (!VE->ReuseShuffleIndices.empty()) {
          // Reshuffle to get only unique values.
          SmallVector<int> Mask(VF, PoisonMaskElem);
          for (auto [I, V] : enumerate(VL)) {
            if (isa<PoisonValue>(V))
              continue;
            Mask[I] = VE->findLaneForValue(V);
          }
          V = FinalShuffle(V, Mask);
        } else {
          assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
                 "Expected vectorization factor less "
                 "than original vector size.");
          SmallVector<int> UniformMask(VF, 0);
          std::iota(UniformMask.begin(), UniformMask.end(), 0);
          V = FinalShuffle(V, UniformMask);
        }
      }
      // Need to update the operand gather node, if actually the operand is
      // not a vectorized node, but a buildvector/gather node, which matches
      // one of the vectorized nodes.
      if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
            return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
          }) == VE->UserTreeIndices.end()) {
        auto *It = find_if(
            VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
              return TE->State == TreeEntry::NeedToGather &&
                     TE->UserTreeIndices.front().UserTE == E &&
                     TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
            });
        assert(It != VectorizableTree.end() && "Expected gather node operand.");
        (*It)->VectorizedValue = V;
      }
      return V;
    }
  }

  // Find the corresponding gather entry and vectorize it.
  auto *I = find_if(VectorizableTree,
                    [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
                      return TE->isOperandGatherNode({E, NodeIdx});
                    });
  assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
  assert(I->get()->UserTreeIndices.size() == 1 &&
         "Expected only single user for the gather node.");
  assert(I->get()->isSame(VL) && "Expected same list of scalars.");
  return vectorizeTree(I->get(), PostponedPHIs);
}
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
                                       E->ReuseShuffleIndices.end());
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Build a mask out of the reorder indices and reorder scalars per this
  // mask.
  SmallVector<int> ReorderMask;
  inversePermutation(E->ReorderIndices, ReorderMask);
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
    unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    auto *It =
        find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
          return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
                   return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
                 }) != TE->UserTreeIndices.end();
        });
    if (It == VectorizableTree.end())
      return false;
    int Idx;
    if ((Mask.size() < InputVF &&
         ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
         Idx == 0) ||
        (Mask.size() == InputVF &&
         ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
      std::iota(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), (I + 1) * SliceSize), 0);
    } else {
      unsigned IVal =
          *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
      std::fill(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size());
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= GatheredScalars.size())
    NumParts = 1;
  if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
    // Check for gathered extracts.
    bool Resized = false;
    ExtractShuffles =
        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(ExtractMask)) {
        if (I == PoisonMaskElem)
          continue;
        if (const auto *TE = getTreeEntry(
                cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
          ExtractEntries.push_back(TE);
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep correct order.
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            Resized = true;
            GatheredScalars.append(VF - GatheredScalars.size(),
                                   PoisonValue::get(OrigScalarTy));
          }
      }
    }
    // Gather extracts after we check for full matched gathers only.
    if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
        E->isAltShuffle() ||
        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffles =
          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
    }
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
          Entries.front().front()->isSame(E->Scalars)) {
        // Perfect match in the graph, will reuse the previously vectorized
        // node. Cost is 0.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          << shortBundleName(E->Scalars) << ".\n");
        // Restore the mask for previous partially matched values.
        Mask.resize(E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
        } else {
          for (auto [I, V] : enumerate(E->Scalars)) {
            if (isa<PoisonValue>(V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        ShuffleBuilder.add(*FrontTE, Mask);
        Res = ShuffleBuilder.finalize(E->getCommonMask());
        return Res;
      }
      if (!Resized) {
        if (GatheredScalars.size() != VF &&
            any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
              return any_of(TEs, [&](const TreeEntry *TE) {
                return TE->getVectorFactor() == VF;
              });
            }))
          GatheredScalars.append(VF - GatheredScalars.size(),
                                 PoisonValue::get(OrigScalarTy));
      }
      // Remove shuffled elements from list of gathers.
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        if (Mask[I] != PoisonMaskElem)
          GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
      }
    }
  }
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
    Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-const values and all constant values.
    // For repeated values, just shuffle them.
    int NumNonConsts = 0;
    int SinglePos = 0;
    for (auto [I, V] : enumerate(Scalars)) {
      if (isa<UndefValue>(V)) {
        if (!isa<PoisonValue>(V)) {
          ReuseMask[I] = I;
          UndefPos.push_back(I);
        }
        continue;
      }
      if (isConstant(V)) {
        ReuseMask[I] = I;
        continue;
      }
      ++NumNonConsts;
      SinglePos = I;
      Value *OrigV = V;
      Scalars[I] = PoisonValue::get(OrigScalarTy);
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore single insert element.
      if (IsSplat) {
        ReuseMask.assign(VF, PoisonMaskElem);
        std::swap(Scalars.front(), Scalars[SinglePos]);
        if (!UndefPos.empty() && UndefPos.front() == 0)
          Scalars.front() = UndefValue::get(OrigScalarTy);
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with the simple broadcast.
      // We can do it if the broadcasted value is guaranteed to be
      // non-poisonous, or by freezing the incoming scalar value first.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
                (E->UserTreeIndices.size() == 1 &&
                 any_of(V->uses(), [E](const Use &U) {
                   // Check if the value is already used in the same operation
                   // in one of the nodes already.
                   return E->UserTreeIndices.front().EdgeIdx !=
                              U.getOperandNo() &&
                          is_contained(
                              E->UserTreeIndices.front().UserTE->Scalars,
                              U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // Set the undef position to the non-poisoned scalar.
          ReuseMask[I] = Pos;
          // Replace the undef by the poison; in the mask it is already
          // replaced by the non-poisoned scalar.
          if (I != Pos)
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
      } else {
        // Replace undefs by the poisons, emit broadcast and then emit
        // freeze.
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          if (isa<UndefValue>(Scalars[I]))
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // Gather of extractelements can be represented as just a shuffle of
      // a single/two vectors the scalars are extracted from.
      // Find input vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
          ExtractMask[I] = PoisonMaskElem;
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          if (isa<UndefValue>(E->Scalars[I]))
            continue;
          auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
          Value *VecOp = EI->getVectorOperand();
          if (const auto *TE = getTreeEntry(VecOp))
            if (TE->VectorizedValue)
              VecOp = TE->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        IsNonPoisoned &=
            isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size());
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
      } else {
        IsUsedInExpr = false;
        ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
                           /*ForExtracts=*/true);
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize = E->Scalars.size() / NumParts;
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
        VecMask.assign(VecMask.size(), PoisonMaskElem);
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          IsUsedInExpr &= FindReusedSplat(
              VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          if (TEs.front()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
                isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
        }
      }
    }
    // Try to figure out the best way to combine values: build a shuffle and
    // insert elements, or just build several shuffles.
    // Insert non-constant scalars.
    SmallVector<Value *> NonConstants(GatheredScalars);
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if currently we
    // have a single permutation and more than 1 scalar constants.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(V) && !isa<PoisonValue>(V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(V) && !isa<UndefValue>(V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(V) && !isa<PoisonValue>(V);
         }) > 1);
    // NonConstants array contains just non-constant values, GatheredScalars
    // contains only constants to build the final vector and then shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(OrigScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
    // Generate constants for the final shuffle and build a mask for them.
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
            TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
            Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
          });
  } else if (!allConstant(GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(E->Scalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
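// NOTE (explanatory sketch, not part of the original source):
// processBuildVector() tries strategies in decreasing order of quality:
//  1. reuse the vectors feeding gathered extractelements (adjustExtracts),
//  2. shuffle one or two already-vectorized tree entries
//     (isGatherShuffledEntry), with the zero-cost "perfect diamond" match
//     handled first,
//  3. pack remaining constants/splats via TryPackScalars and gather(),
// and finally freezes the result only if undef lanes were replaced by a
// broadcast of a possibly-poisonous scalar (NeedFreeze).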
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder, *this);
}
Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
  if (E->VectorizedValue &&
      (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
       E->isAltShuffle())) {
    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  if (auto *Store = dyn_cast<StoreInst>(V))
    ScalarTy = Store->getValueOperand()->getType();
  else if (auto *IE = dyn_cast<InsertElementInst>(V))
    ScalarTy = IE->getOperand(1)->getType();
  auto It = MinBWs.find(E);
  if (It != MinBWs.end())
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
  auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
  if (E->State == TreeEntry::NeedToGather) {
    // Set insert point for non-reduction initial nodes.
    if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy);
    E->VectorizedValue = Vec;
    return Vec;
  }

  bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
      ShuffleBuilder.addOrdered(V, std::nullopt);
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
  };

  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize) &&
         "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() ||
            !E->UserTreeIndices.empty()) &&
           "PHI reordering is free.");
    if (PostponedPHIs && E->VectorizedValue)
      return E->VectorizedValue;
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    Builder.SetCurrentDebugLocation(PH->getDebugLoc());
    if (PostponedPHIs || !E->VectorizedValue) {
      PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
      E->PHI = NewPhi;
      Value *V = NewPhi;

      // Adjust insertion point once all PHI's have been generated.
      Builder.SetInsertPoint(PH->getParent(),
                             PH->getParent()->getFirstInsertionPt());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());

      V = FinalShuffle(V, E, VecTy);

      E->VectorizedValue = V;
      if (PostponedPHIs)
        return V;
    }
    PHINode *NewPhi = cast<PHINode>(E->PHI);

    // PHINodes may have multiple entries from the same block. We want to
    // visit every block once.
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;

    for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
      BasicBlock *IBB = PH->getIncomingBlock(I);

      // Stop emission if all incoming values are generated.
      if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return NewPhi;
      }

      if (!VisitedBBs.insert(IBB).second) {
        NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
        continue;
      }

      Builder.SetInsertPoint(IBB->getTerminator());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
      Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, I)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      NewPhi->addIncoming(Vec, IBB);
    }

    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
           "Invalid number of incoming values");
    return E->VectorizedValue;
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    if (const TreeEntry *TE = getTreeEntry(V))
      V = TE->VectorizedValue;
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E, VecTy);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    Value *NewV = propagateMetadata(V, E->Scalars);
    NewV = FinalShuffle(NewV, E, VecTy);
    E->VectorizedValue = NewV;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
    Value *V = vectorizeOperand(E, 1, PostponedPHIs);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
      assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
          V,
          FixedVectorType::get(
              ScalarTy,
              cast<FixedVectorType>(V->getType())->getNumElements()),
          Res.second);
    }

    // Create InsertVector shuffle if necessary.
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();

    unsigned Offset = *getInsertIndex(VL0);
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create the shuffle to resize the vector.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(NumElts - NumScalars, PoisonMaskElem);
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      Value *Scalar = E->Scalars[PrevMask[I]];
      unsigned InsertIdx = *getInsertIndex(Scalar);
      IsIdentity &= InsertIdx - Offset == I;
      Mask[InsertIdx - Offset] = I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // Follow all insert element instructions from the current buildvector
        // sequence.
        InsertElementInst *Ins = cast<InsertElementInst>(VL0);
        do {
          std::optional<unsigned> InsertIdx = getInsertIndex(Ins);
          if (!InsertIdx)
            break;
          if (InsertMask[*InsertIdx] == PoisonMaskElem)
            InsertMask[*InsertIdx] = *InsertIdx;
          if (!Ins->hasOneUse())
            break;
          Ins = dyn_cast_or_null<InsertElementInst>(
              Ins->getUniqueUndroppableUser());
        } while (Ins);
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        SmallBitVector IsFirstUndef =
            isUndefVector(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          unsigned Idx = 0;
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              if (!V2)
                V2 = UndefValue::get(V->getType());
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              ++Idx;
            } else if (InsertMask[I] != PoisonMaskElem &&
                       Mask[I] == PoisonMaskElem) {
              InsertMask[I] = PoisonMaskElem;
            }
          }
        } else {
          InsertMask = Mask;
        }
      }
      if (!V2)
        V2 = PoisonValue::get(V->getType());
      V = Builder.CreateShuffleVector(V, V2, InsertMask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    for (unsigned I = 0; I < NumElts; I++) {
      if (Mask[I] != PoisonMaskElem)
        InsertMask[Offset + I] = I;
    }
    SmallBitVector UseMask =
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
    SmallBitVector IsFirstUndef =
        isUndefVector(FirstInsert->getOperand(0), UseMask);
    if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
        NumElts != NumScalars) {
      if (IsFirstUndef.all()) {
        if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
              if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
                InsertMask[I] = I + NumElts;
            }
          }
          V = Builder.CreateShuffleVector(
              V,
              IsFirstPoison.all() ? PoisonValue::get(V->getType())
                                  : FirstInsert->getOperand(0),
              InsertMask, cast<Instruction>(E->Scalars.back())->getName());
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      } else {
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        for (unsigned I = 0; I < NumElts; I++) {
          if (InsertMask[I] == PoisonMaskElem)
            InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
          else
            InsertMask[I] += NumElts;
        }
        V = Builder.CreateShuffleVector(
            FirstInsert->getOperand(0), V, InsertMask,
            cast<Instruction>(E->Scalars.back())->getName());
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
    }

    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
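  // NOTE (explanatory sketch, not part of the original source): in the
  // insertelement case the two-source shuffle masks use the convention that
  // indices < NumElts select from the new vector V and indices >= NumElts
  // select from the original destination vector, e.g. for NumElts == 4,
  // InsertMask = {4, 0, 1, 7} keeps lanes 0 and 3 of the destination and
  // writes scalars 0 and 1 of V into lanes 1 and 2.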
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    auto *CI = cast<CastInst>(VL0);
    Instruction::CastOps VecOpcode = CI->getOpcode();
    Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         SrcScalarTy != CI->getOperand(0)->getType())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
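  // NOTE (explanatory sketch, not part of the original source): with
  // MinBWs-driven demotion the cast opcode above is rewritten from the
  // comparison of the (possibly demoted) source and destination bit widths:
  //   BWSz == SrcBWSz -> BitCast (pure reinterpretation)
  //   BWSz <  SrcBWSz -> Trunc
  //   BWSz >  SrcBWSz -> SExt/ZExt, depending on the recorded signedness
  // and sitofp becomes uitofp when the demoted source is known unsigned.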
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *R = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (L->getType() != R->getType()) {
      assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
              getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }

    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    Value *V = Builder.CreateCmp(P0, L, R);
    propagateIRFlags(V, E->Scalars, VL0);
    // Do not cast for cmps.
    VecTy = cast<FixedVectorType>(V->getType());
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *True = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *False = vectorizeOperand(E, 2, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((It != MinBWs.end() ||
              getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
              getOperandEntry(E, 2)->State == TreeEntry::NeedToGather ||
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }

    Value *V = Builder.CreateSelect(Cond, True, False);
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);

    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    Value *V = Builder.CreateUnOp(
        static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *I = dyn_cast<Instruction>(V))
      V = propagateMetadata(I, E->Scalars);

    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
      assert((It != MinBWs.end() ||
              getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
              getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (LHS->getType() != VecTy)
        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      if (RHS->getType() != VecTy)
        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }

    Value *V = Builder.CreateBinOp(
        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
    propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = propagateMetadata(I, E->Scalars);
      // Drop nuw flags for abs(sub(commutative), true).
      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          any_of(E->Scalars, [](Value *V) {
            return isCommutative(cast<Instruction>(V));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    }

    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::StridedVectorize) {
      Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
      Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
      PO = IsReverseOrder ? PtrN : Ptr0;
      std::optional<int> Diff = getPointersDiff(
          VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
      Type *StrideTy = DL->getIndexType(PO->getType());
      Value *StrideVal;
      if (Diff) {
        int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
        StrideVal =
            ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                           DL->getTypeAllocSize(ScalarTy));
      } else {
        SmallVector<Value *> PointerOps(E->Scalars.size());
        transform(E->Scalars, PointerOps.begin(), [](Value *V) {
          return cast<LoadInst>(V)->getPointerOperand();
        });
        OrdersType Order;
        std::optional<Value *> Stride =
            calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
                              &*Builder.GetInsertPoint());
        Value *NewStride =
            Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
        StrideVal = Builder.CreateMul(
            NewStride,
            ConstantInt::get(
                StrideTy,
                (IsReverseOrder ? -1 : 1) *
                    static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
      }
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {VecTy, PO->getType(), StrideTy},
          {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /*ArgNo=*/0,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      // Use the minimum alignment of the gathered loads.
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = propagateMetadata(NewLI, E->Scalars);

    V = FinalShuffle(V, E, VecTy);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
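  // NOTE (explanatory sketch, not part of the original source): strided
  // loads are emitted through the llvm.experimental.vp.strided.load
  // intrinsic with the stride expressed in bytes and negated for reversed
  // element order, e.g. (hypothetical IR for 4 x i32 with a stride of 2
  // elements, i.e. 8 bytes):
  //   %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
  //            ptr %base, i64 8,
  //            <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)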
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);

    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E, VecTy);

    Value *Ptr = SI->getPointerOperand();
    StoreInst *ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());

    Value *V = propagateMetadata(ST, E->Scalars);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    SmallVector<Type *, 2> TysForDecl;
    auto *CEI = cast<CallInst>(VL0);
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      // Some intrinsics have scalar arguments. This argument should not be
      // vectorized.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
        ScalarArg = CEI->getArgOperand(I);
        // If we decided to reduce the bitwidth of an abs intrinsic, its
        // second argument must be set to false (do not return poison if the
        // value is the signed minimum).
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        OpVecs.push_back(ScalarArg);
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      ScalarArg = CEI->getArgOperand(I);
      if (cast<VectorType>(OpVec->getType())->getElementType() !=
              ScalarArg->getType()->getScalarType() &&
          It == MinBWs.end()) {
        auto *CastTy = FixedVectorType::get(ScalarArg->getType(),
                                            VecTy->getNumElements());
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      OpVecs.push_back(OpVec);
    }

    Function *CF;
    if (!UseIntrinsic) {
      VFShape Shape =
          VFShape::get(CI->getFunctionType(),
                       ElementCount::getFixed(VecTy->getNumElements()),
                       /*HasGlobalPred=*/false);
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
    } else {
      CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

    propagateIRFlags(V, E->Scalars, VL0);
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    assert(E->isAltShuffle() &&
           ((Instruction::isBinaryOp(E->getOpcode()) &&
             Instruction::isBinaryOp(E->getAltOpcode())) ||
            (Instruction::isCast(E->getOpcode()) &&
             Instruction::isCast(E->getAltOpcode())) ||
            (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
           "Invalid Shuffle Vector Operand");

    Value *LHS = nullptr, *RHS = nullptr;
    if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
      setInsertPointAfterBundle(E);
      LHS = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      RHS = vectorizeOperand(E, 1, PostponedPHIs);
    } else {
      setInsertPointAfterBundle(E);
      LHS = vectorizeOperand(E, 0, PostponedPHIs);
    }
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (LHS && RHS &&
        ((Instruction::isBinaryOp(E->getOpcode()) &&
          (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
         (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
      assert((It != MinBWs.end() ||
              getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
              getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      Type *CastTy = VecTy;
      if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
        if (cast<VectorType>(LHS->getType())
                ->getElementType()
                ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                             ->getElementType()
                                             ->getIntegerBitWidth())
          CastTy = RHS->getType();
        else
          CastTy = LHS->getType();
      }
      if (LHS->getType() != CastTy)
        LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
      if (RHS->getType() != CastTy)
        RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
    }

    Value *V0, *V1;
    if (Instruction::isBinaryOp(E->getOpcode())) {
      V0 = Builder.CreateBinOp(
          static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
      V1 = Builder.CreateBinOp(
          static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
    } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
      V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
      auto *AltCI = cast<CmpInst>(E->getAltOp());
      CmpInst::Predicate AltPred = AltCI->getPredicate();
      V1 = Builder.CreateCmp(AltPred, LHS, RHS);
    } else {
      if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
        unsigned SrcBWSz = DL->getTypeSizeInBits(
            cast<VectorType>(LHS->getType())->getElementType());
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        if (BWSz <= SrcBWSz) {
          if (BWSz < SrcBWSz)
            LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
          assert(LHS->getType() == VecTy && "Expected same type as operand.");
          if (auto *I = dyn_cast<Instruction>(LHS))
            LHS = propagateMetadata(I, E->Scalars);
          E->VectorizedValue = LHS;
          ++NumVectorInstructions;
          return LHS;
        }
      }
      V0 = Builder.CreateCast(
          static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
      V1 = Builder.CreateCast(
          static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
    }
    // Add V0 and V1 to later analysis to try to find and remove matching
    // instruction, if any.
    for (Value *V : {V0, V1}) {
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    // Create shuffle to take alternate operations from the vector.
    // Also, gather up main and alt scalar ops to propagate IR flags to
    // each vector operation.
    ValueList OpScalars, AltScalars;
    SmallVector<int> Mask;
    E->buildAltOpShuffleMask(
        [E, this](Instruction *I) {
          assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
          return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                        *TLI);
        },
        Mask, &OpScalars, &AltScalars);

    propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
    propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
    auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
      // Drop nuw flags for abs(sub(commutative), true).
      if (auto *I = dyn_cast<Instruction>(Vec);
          I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
          any_of(E->Scalars, [](Value *V) {
            auto *IV = cast<Instruction>(V);
            return IV->getOpcode() == Instruction::Sub &&
                   isCommutative(cast<Instruction>(IV));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    };
    DropNuwFlag(V0, E->getOpcode());
    DropNuwFlag(V1, E->getAltOpcode());

    Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = propagateMetadata(I, E->Scalars);
      GatherShuffleExtractSeq.insert(I);
      CSEBlocks.insert(I->getParent());
    }

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  default:
    llvm_unreachable("unknown inst");
  }
  return nullptr;
}
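// NOTE (explanatory sketch, not part of the original source): for alternate
// opcode nodes both full-width operations are materialized and then blended,
// e.g. vectorizing {a0+b0, a1-b1, a2+b2, a3-b3}:
//   V0 = add <4 x i32> A, B
//   V1 = sub <4 x i32> A, B
//   V  = shufflevector V0, V1, <0, 5, 2, 7>
// where buildAltOpShuffleMask() selects lane I from V0 or from V1 (as
// I + VF) depending on whether the scalar used the main or the alternate
// opcode.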
Value *BoUpSLP::vectorizeTree() {
  ExtraValueToDebugLocsMap ExternallyUsedValues;
  SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
  return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
}
namespace {
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  MapVector<Value *, SmallVector<int>> ValueMasks;
};
} // namespace

Value *BoUpSLP::vectorizeTree(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues,
    SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
    Instruction *ReductionRoot) {
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }
  // Clean the Entry-to-LastInstruction table; it can be affected after
  // scheduling and needs to be rebuilt.
  EntryToLastInstruction.clear();

  if (ReductionRoot)
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
  else
    Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());

  // Postpone emission of PHIs operands to avoid cyclic dependency issues.
  (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
    if (TE->State == TreeEntry::Vectorize &&
        TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
        TE->VectorizedValue)
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
  // Run through the list of postponed gathers and emit them, replacing the
  // stub instructions by the actual vectorization results.
  DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
  SmallVector<const TreeEntry *> PostponedNodes(PostponedGathers.begin(),
                                                PostponedGathers.end());
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
      if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
              TE->UserTreeIndices.front().EdgeIdx)) &&
          VecTE->isSame(TE->Scalars))
        // Found a gather node which is absolutely the same as one of the
        // vectorized nodes. It may happen after reordering.
        continue;
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI =
        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator, since the stub instruction's position may
    // have become invalid.
    if (isa<PHINode>(UserI)) {
      // Insert before all users.
      Instruction *InsertPt = PrevVec->getParent()->getTerminator();
      for (User *U : PrevVec->users()) {
        if (U == UserI)
          continue;
        auto *UI = dyn_cast<Instruction>(U);
        if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
          continue;
        if (UI->comesBefore(InsertPt))
          InsertPt = UI;
      }
      Builder.SetInsertPoint(InsertPt);
    } else {
      Builder.SetInsertPoint(PrevVec);
    }
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
    Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        if (const TreeEntry *BaseTE = getTreeEntry(V)) {
          auto It = MinBWs.find(BaseTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            if (*IsSigned)
              break;
          }
          for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
            auto It = MinBWs.find(MNTE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          // Scan through the buildvector scalars as well.
          for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
            auto It = MinBWs.find(BVE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
            IsSigned =
                IsSigned.value_or(false) ||
                !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
            continue;
          }
          if (IsSigned.value_or(false))
            break;
        }
      }
      if (IsSigned.value_or(false)) {
        // Final attempt - check the user node.
        auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(IsSigned &&
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node, if it was used before for one of the
    // buildvector nodes already.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    eraseInstruction(PrevVec);
  }
  DenseMap<Value *, DenseMap<BasicBlock *,
                             std::pair<Instruction *, Instruction *>>>
      ScalarToEEs;
  SmallDenseSet<Value *, 4> UsedInserts;
  DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
  SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
  SmallVector<ShuffledInsertData> ShuffledInserts;
  // Maps vector instruction to the original insertelement instruction.
  DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
  // Extract all of the elements with the external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    llvm::User *User = ExternalUse.User;

    // Skip users that we already RAUW. This happens when one instruction
    // has multiple uses of the same value.
    if (User && !is_contained(Scalar->users(), User))
      continue;
    TreeEntry *E = getTreeEntry(Scalar);
    assert(E && "Invalid scalar");
    assert(E->State != TreeEntry::NeedToGather &&
           "Extracting from a gather list");
    // Non-instruction pointers are not deleted, just skip them.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
        bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts, just move the only one in the
          // current block.
          auto EEIt = It->second.find(Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Instruction *I = EEIt->second.first;
            if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
              if (auto *CI = EEIt->second.second)
                CI->moveAfter(I);
            }
            Ex = I;
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // "Reuse" the existing extract to improve final codegen.
          if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
            Value *V = ES->getVectorOperand();
            if (const TreeEntry *ETE = getTreeEntry(V))
              V = ETE->VectorizedValue;
            Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
          } else if (ReplaceGEP) {
            // Leave the GEPs as is, they are free in most cases.
            auto *CloneGEP = GEP->clone();
            if (isa<Instruction>(Vec))
              CloneGEP->insertBefore(*Builder.GetInsertBlock(),
                                     Builder.GetInsertPoint());
            else
              CloneGEP->insertBefore(GEP);
            if (GEP->hasName())
              CloneGEP->takeName(GEP);
            Ex = CloneGEP;
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          // If necessary, sign-extend or zero-extend ScalarRoot to the larger
          // type.
          ExV = Ex;
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
                                        MinBWs.find(E)->second.second);
          if (auto *I = dyn_cast<Instruction>(Ex))
            ScalarToEEs[Scalar].try_emplace(
                Builder.GetInsertBlock(),
                std::make_pair(I, cast<Instruction>(ExV)));
        }
        // The then branch of the previous if may produce constants, since 0
        // operand might be a constant.
        if (auto *ExI = dyn_cast<Instruction>(Ex)) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        return ExV;
      }
      assert(isa<FixedVectorType>(Scalar->getType()) &&
             isa<InsertElementInst>(Scalar) &&
             "In-tree scalar of vector type is not insertelement?");
      auto *IE = cast<InsertElementInst>(Scalar);
      VectorToInsertElement.try_emplace(Vec, IE);
      return Vec;
    };
    // If User == nullptr, the Scalar remains as scalar in vectorized
    // instructions or is used as an extra argument. Generate an
    // ExtractElement instruction and update the record for this scalar in
    // ExternallyUsedValues.
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert((ExternallyUsedValues.count(Scalar) ||
              any_of(Scalar->users(),
                     [&](llvm::User *U) {
                       if (ExternalUsesAsGEPs.contains(U))
                         return true;
                       TreeEntry *UseEntry = getTreeEntry(U);
                       return UseEntry &&
                              (UseEntry->State == TreeEntry::Vectorize ||
                               UseEntry->State ==
                                   TreeEntry::StridedVectorize) &&
                              (E->State == TreeEntry::Vectorize ||
                               E->State == TreeEntry::StridedVectorize) &&
                              doesInTreeUserNeedToExtract(
                                  Scalar,
                                  cast<Instruction>(UseEntry->Scalars.front()),
                                  TLI);
                     })) &&
             "Scalar with nullptr User must be registered in "
             "ExternallyUsedValues map or remain as scalar in vectorized "
             "instructions");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI))
          Builder.SetInsertPoint(PHI->getParent(),
                                 PHI->getParent()->getFirstNonPHIIt());
        else
          Builder.SetInsertPoint(VecI->getParent(),
                                 std::next(VecI->getIterator()));
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock(),
                               F->getEntryBlock().begin());
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // Required to update internally referenced instructions.
      Scalar->replaceAllUsesWith(NewInst);
      ReplacedExternals.emplace_back(Scalar, NewInst);
      continue;
    }

    if (auto *VU = dyn_cast<InsertElementInst>(User)) {
      // Skip if the scalar is another vector op or Vec is not an instruction.
      if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
        if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
          if (!UsedInserts.insert(VU).second)
            continue;
          // Need to use the original vector, if the root is truncated.
          auto BWIt = MinBWs.find(E);
          if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
            auto *ScalarTy = FTy->getElementType();
            auto Key = std::make_pair(Vec, ScalarTy);
            auto VecIt = VectorCasts.find(Key);
            if (VecIt == VectorCasts.end()) {
              IRBuilderBase::InsertPointGuard Guard(Builder);
              if (auto *IVec = dyn_cast<Instruction>(Vec))
                Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
              Vec = Builder.CreateIntCast(
                  Vec,
                  FixedVectorType::get(
                      ScalarTy,
                      cast<FixedVectorType>(Vec->getType())->getNumElements()),
                  BWIt->second.second);
              VectorCasts.try_emplace(Key, Vec);
            } else {
              Vec = VecIt->second;
            }
          }

          std::optional<unsigned> InsertIdx = getInsertIndex(VU);
          if (InsertIdx) {
            auto *It =
                find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
                  // Checks if 2 insertelements are from the same buildvector.
                  InsertElementInst *VecInsert = Data.InsertElements.front();
                  return areTwoInsertFromSameBuildVector(
                      VU, VecInsert,
                      [](InsertElementInst *II) { return II->getOperand(0); });
                });
            unsigned Idx = *InsertIdx;
            if (It == ShuffledInserts.end()) {
              (void)ShuffledInserts.emplace_back();
              It = std::next(ShuffledInserts.begin(),
                             ShuffledInserts.size() - 1);
              SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
              if (Mask.empty())
                Mask.assign(FTy->getNumElements(), PoisonMaskElem);
              // Find the insertvector, vectorized in the tree, if any.
              Value *Base = VU;
              while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
                if (IEBase != User &&
                    (!IEBase->hasOneUse() ||
                     getInsertIndex(IEBase).value_or(Idx) == Idx))
                  break;
                // Build the mask for the vectorized insertelement
                // instructions.
                if (const TreeEntry *E = getTreeEntry(IEBase)) {
                  do {
                    IEBase = cast<InsertElementInst>(Base);
                    int IEIdx = *getInsertIndex(IEBase);
                    assert(Mask[IEIdx] == PoisonMaskElem &&
                           "InsertElementInstruction used already.");
                    Mask[IEIdx] = IEIdx;
                    Base = IEBase->getOperand(0);
                  } while (E == getTreeEntry(Base));
                  break;
                }
                Base = cast<InsertElementInst>(Base)->getOperand(0);
                // After the vectorization the def-use chain has changed, need
                // to look through original insertelement instructions, if
                // they get replaced by vector instructions.
                auto It = VectorToInsertElement.find(Base);
                if (It != VectorToInsertElement.end())
                  Base = It->second;
              }
            }
            SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
            if (Mask.empty())
              Mask.assign(FTy->getNumElements(), PoisonMaskElem);
            Mask[Idx] = ExternalUse.Lane;
            It->InsertElements.push_back(cast<InsertElementInst>(User));
            continue;
          }
        }
      }
    }

    // Generate extracts for out-of-tree users.
    // Find the insertion point for the extractelement lane.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (PHINode *PH = dyn_cast<PHINode>(User)) {
        // If the user is a PHI node, we have to insert the extract in the
        // predecessor block.
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
            }
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        Builder.SetInsertPoint(cast<Instruction>(User));
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        User->replaceUsesOfWith(Scalar, NewInst);
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      User->replaceUsesOfWith(Scalar, NewInst);
    }
  }
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize(std::nullopt);
  };

  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
    if (VF != VecVF) {
      if (any_of(Mask,
                 [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
        Vec = CreateShuffle(Vec, nullptr, Mask);
        return std::make_pair(Vec, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        Vec = CreateShuffle(Vec, nullptr, ResizeMask);
      }
    }

    return std::make_pair(Vec, false);
  };
  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // Find the first and the last instruction in the list of insertelements.
    sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    Value *NewInst = performExtractsShuffleAction<Value>(
        MutableArrayRef(Vector.data(), Vector.size()),
        FirstInsert->getOperand(0),
        [](Value *Vec) {
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
        },
        ResizeToVF,
        [&](ArrayRef<int> Mask, ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create shuffle if the mask is a simple identity
            // non-resizing mask.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : FirstInsert->getOperand(0),
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild the original scalar type chain of the insertelements.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    SmallVector<Instruction *> Inserts;
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      if (*It == II)
        ++It;
      else
        Inserts.push_back(cast<Instruction>(II));
      II = dyn_cast<InsertElementInst>(II->getOperand(0));
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
      NewInst = II;
    }
    LastInsert->replaceAllUsesWith(NewInst);
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            PoisonValue::get(IE->getOperand(0)->getType()));
      IE->replaceUsesOfWith(IE->getOperand(1),
                            PoisonValue::get(IE->getOperand(1)->getType()));
      eraseInstruction(IE);
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
  SmallVector<Instruction *> RemovedInsts;
  // For each vectorized value:
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->State == TreeEntry::NeedToGather)
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
        continue;
#ifndef NDEBUG
      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
        for (User *U : Scalar->users()) {
          // It is legal to delete users in the ignorelist.
          assert((getTreeEntry(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
        }
      }
#endif
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      RemovedInsts.push_back(cast<Instruction>(Scalar));
    }
  }

  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);

  InstrElementSize.clear();

  const TreeEntry &RootTE = *VectorizableTree.front().get();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                        cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
  }
  return Vec;
}
13778 <<
" gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or the element that we insert into it are instructions
    // that are defined in this basic block then we can't hoist this
    // instruction.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator());
    CSEBlocks.insert(PreHeader);
  }
  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  SmallVector<BasicBlock *, 8> CSEWorkList(CSEBlocks.begin(), CSEBlocks.end());
  llvm::stable_sort(CSEWorkList, [](const BasicBlock *A, const BasicBlock *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Less defined shuffles can be replaced by the more defined copies.
  // Between two shuffles one is less defined if it has the same vector
  // operands and its mask indices are the same as in the first one or undefs.
  auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
                                           SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the last undefs actually change the final number of used vector
    // registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(SI1->getType()) ==
               TTI->getNumberOfParts(
                   FixedVectorType::get(SI1->getType()->getElementType(),
                                        SM1.size() - LastUndefsCnt));
  };
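  // Perform O(N^2) search over the gather/shuffle sequences and merge
  // identical instructions, visiting blocks in domination order.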
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = *I;
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
      if (isDeleted(&In))
        continue;
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the visited
      // instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          eraseInstruction(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          In.moveAfter(V);
          V->replaceAllUsesWith(&In);
          eraseInstruction(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced) {
        assert(!is_contained(Visited, &In));
        Visited.push_back(&In);
      }
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
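// Chain the ScheduleData nodes of the given values into a single bundle, which
// is then treated as one scheduling entity.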
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
  ScheduleData *Bundle = nullptr;
  ScheduleData *PrevInBundle = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member "
           "(maybe not in same basic block)");
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle) {
      PrevInBundle->NextInBundle = BundleMember;
    } else {
      Bundle = BundleMember;
    }

    // Group the instructions to a bundle.
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  assert(Bundle && "Failed to find schedule bundle");
  return Bundle;
}
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  // No need to schedule PHIs, insertelement, extractelement and extractvalue
  // instructions.
  if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
      doesNotNeedToSchedule(VL))
    return nullptr;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.OpValue << "\n");

  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
                                                           ScheduleData *Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    if (ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
        doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
      ReSchedule = true;
    }
    if (Bundle) {
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
    }

    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it. Note that we don't
    // "schedule" the bundle yet (see cancelScheduling).
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleData *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
             "must be ready to schedule");
      schedule(Picked, ReadyInsts);
    }
  };

  // Make sure that the scheduling region contains all instructions of the
  // bundle.
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // Otherwise the compiler may crash trying to incorrectly calculate
      // dependencies and emit instructions in the wrong order at the actual
      // scheduling.
      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");

    // Make sure we don't leave the pieces of the bundle in the ready list when
    // the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);

    if (!BundleMember->IsScheduled)
      continue;
    // A bundle member was scheduled as single instruction before and now needs
    // to be scheduled as part of the bundle. We just get rid of the existing
    // schedule.
    LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  auto *Bundle = buildBundle(VL);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle->isReady()) {
    cancelScheduling(VL, S.OpValue);
    return std::nullopt;
  }
  return Bundle;
}
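// Un-bundles a group of instructions: the bundle is broken back into single
// instructions, which become their own scheduling entities again.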
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
                                                Value *OpValue) {
  if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
      doesNotNeedToSchedule(VL))
    return;

  if (doesNotNeedToBeScheduled(OpValue))
    OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
  ScheduleData *Bundle = getScheduleData(OpValue);
  LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() &&
         (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
         "tried to unbundle something which is not a bundle");

  // Remove the bundle from the ready list.
  if (Bundle->isReady())
    ReadyInsts.remove(Bundle);

  // Un-bundle: make single instructions out of the bundle.
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->TE = nullptr;
    if (BundleMember->unscheduledDepsInBundle() == 0) {
      ReadyInsts.insert(BundleMember);
    }
    BundleMember = Next;
  }
}
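// ScheduleData objects are allocated in fixed-size chunks to avoid frequent
// small allocations.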
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
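// Extends the scheduling region so that V is inside it. The search walks up
// and down from the current region simultaneously, since we don't know on
// which side of the region the new instruction lies.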
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
                                                      const InstructionsState &S) {
  // If V is the S.OpValue or an alternate instruction for it, the
  // corresponding ScheduleData may already exist.
  if (getScheduleData(V, isOneOf(S, V)))
    return true;
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
         !doesNotNeedToBeScheduled(I) &&
         "phi nodes/insertelements/extractelements/extractvalues don't need to "
         "be scheduled");
  auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
    ScheduleData *ISD = getScheduleData(I);
    if (!ISD)
      return false;
    assert(isInSchedulingRegion(ISD) &&
           "ScheduleData not in scheduling region");
    ScheduleData *SD = allocateScheduleDataChunks();
    SD->Inst = I;
    SD->init(SchedulingRegionID, S.OpValue);
    ExtraScheduleDataMap[I][S.OpValue] = SD;
    return true;
  };
  if (CheckScheduleForI(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    if (isOneOf(S, I) != I)
      CheckScheduleForI(I);
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region. Ignore
  // debug info (and other "AssumeLike" intrinsics) so that it is not counted
  // against the budget.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
      return false;
    }

    ++UpIter;
    ++DownIter;

    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    if (isOneOf(S, I) != I)
      CheckScheduleForI(I);
    LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  if (isOneOf(S, I) != I)
    CheckScheduleForI(I);
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
    if (doesNotNeedToBeScheduled(I))
      continue;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
      SD->Inst = I;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
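// Computes def-use, control and memory dependencies for the given bundle and,
// transitively, for every bundle it depends on.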
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  SmallVector<ScheduleData *, 10> WorkList;
  WorkList.push_back(SD);

  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.pop_back_val();
    for (ScheduleData *BundleMember = SD; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      assert(isInSchedulingRegion(BundleMember));
      if (BundleMember->hasValidDependencies())
        continue;

      BundleMember->Dependencies = 0;
      BundleMember->resetUnscheduledDeps();

      // Handle def-use chain dependencies.
      if (BundleMember->OpValue != BundleMember->Inst) {
        if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
      } else {
        for (User *U : BundleMember->Inst->users()) {
          if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
            BundleMember->Dependencies++;
            ScheduleData *DestBundle = UseSD->FirstInBundle;
            if (!DestBundle->IsScheduled)
              BundleMember->incrementUnscheduledDeps(1);
            if (!DestBundle->hasValidDependencies())
              WorkList.push_back(DestBundle);
          }
        }
      }

      auto MakeControlDependent = [&](Instruction *I) {
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->ControlDependencies.push_back(BundleMember);
        BundleMember->Dependencies++;
        ScheduleData *DestBundle = DepDest->FirstInBundle;
        if (!DestBundle->IsScheduled)
          BundleMember->incrementUnscheduledDeps(1);
        if (!DestBundle->hasValidDependencies())
          WorkList.push_back(DestBundle);
      };

      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
        for (Instruction *I = BundleMember->Inst->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
            continue;

          // Add the dependency.
          MakeControlDependent(I);

          if (!isGuaranteedToTransferExecutionToSuccessor(I))
            // Everything past here must be control dependent on I.
            break;
        }
      }

      if (RegionHasStackSave) {
        // If we have an inalloca alloca instruction, it needs to be scheduled
        // after any preceding stacksave. We also need to prevent any alloca
        // from reordering above a preceding stackrestore.
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst,
                  m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              break;

            if (!isa<AllocaInst>(I))
              continue;

            // Add the dependency.
            MakeControlDependent(I);
          }
        }

        // In addition, we need to prevent allocas and loads/stores from
        // moving below a stacksave or a stackrestore.
        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              continue;

            // Add the dependency.
            MakeControlDependent(I);
            break;
          }
        }
      }

      // Handle the memory dependencies (if any).
      ScheduleData *DepDest = BundleMember->NextLoadStore;
      if (!DepDest)
        continue;
      Instruction *SrcInst = BundleMember->Inst;
      assert(SrcInst->mayReadOrWriteMemory() &&
             "NextLoadStore list for non memory effecting bundle?");
      MemoryLocation SrcLoc = getLocation(SrcInst);
      bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
      unsigned NumAliased = 0;
      unsigned DistToSrc = 1;

      for (; DepDest; DepDest = DepDest->NextLoadStore) {
        assert(isInSchedulingRegion(DepDest));

        // We have two limits to reduce the complexity:
        // 1) AliasedCheckLimit: a small limit to reduce calls to
        //    SLP->isAliased (the expensive part in this loop).
        // 2) MaxMemDepDistance: for very large blocks, aborts the whole loop.
        if (DistToSrc >= MaxMemDepDistance ||
            ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
             (NumAliased >= AliasedCheckLimit ||
              SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {

          // We increment the counter only if the locations are aliased
          // (instead of counting all alias checks).
          NumAliased++;

          DepDest->MemoryDependencies.push_back(BundleMember);
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = DepDest->FirstInBundle;
          if (!DestBundle->IsScheduled) {
            BundleMember->incrementUnscheduledDeps(1);
          }
          if (!DestBundle->hasValidDependencies()) {
            WorkList.push_back(DestBundle);
          }
        }

        if (DistToSrc >= 2 * MaxMemDepDistance)
          break;
        DistToSrc++;
      }
    }
  }
  if (InsertInReadyList && SD->isReady()) {
    ReadyInsts.insert(SD);
    LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst
                      << "\n");
  }
}
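// Marks every ScheduleData node in the region as unscheduled and resets the
// unscheduled-dependency counters, so the block can be scheduled again.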
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd;
       I = I->getNextNode()) {
    doForAllOpcodes(I, [&](ScheduleData *SD) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->IsScheduled = false;
      SD->resetUnscheduledDeps();
    });
  }
  ReadyInsts.clear();
}
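// Performs the final scheduling of a block: a list-scheduling pass over the
// scheduling region that moves instructions into a valid order for the
// vectorized bundles.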
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
      TreeEntry *SDTE = getTreeEntry(SD->Inst);
      (void)SDTE;
      assert((isVectorLikeInstWithConstOps(SD->Inst) ||
              SD->isPartOfBundle() ==
                  (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;

      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, false, this);
    });
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      Instruction *PickedInst = BundleMember->Inst;
      if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
        PickedInst->moveAfter(LastScheduledInst->getPrevNode());
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
      if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
        assert(SD->IsScheduled && "must be scheduled at this point");
      }
    });
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
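// Returns the element width (in bits) to base the vectorization factor on.
// For stores this is simply the width of the stored value; otherwise the
// expression tree is walked bottom-up looking for loads/extracts.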
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value (or value
  // truncated just before storing) without traversing the expression tree.
  // This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type.
  SmallVector<std::pair<Instruction *, BasicBlock *>> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V)) {
    Worklist.emplace_back(I, I->getParent());
    Visited.insert(I);
  }

  // Traverse the expression tree in bottom-up order looking for loads.
  unsigned Width = 0;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip it.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;

    // If the current instruction is a load, update Width to reflect the width
    // of the loaded value.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));

    // Otherwise visit the operands of the instruction. If an operand is an
    // instruction we haven't yet visited and is from the same basic block as
    // the user (or the user is a PHI node), add it to the worklist.
    else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
                 BinaryOperator, UnaryOperator>(I)) {
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            Worklist.emplace_back(J, J->getParent());
            continue;
          }
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
    } else {
      break;
    }
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}
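// Conservatively collects the tree entries whose scalars can be demoted to a
// smaller integer bit width without changing the result; used to drive
// MinBWs-based narrowing.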
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
    bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth = DL->getTypeSizeInBits(E.Scalars.front()->getType());
  if (OrigBitWidth == BitWidth) {
    MaxDepthLevel = 1;
    return true;
  }
  bool IsSigned = any_of(E.Scalars, [&](Value *R) {
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (MultiNodeScalars.contains(V))
      return false;
    unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    if (IsSigned)
      ++BitWidth1;
    if (auto *I = dyn_cast<Instruction>(V)) {
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSigned && BitWidth2 < OrigBitWidth) {
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
        if (MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL)))
          break;
        BitWidth2 *= 2;
      }
      BitWidth1 = std::min(BitWidth1, BitWidth2);
    }
    BitWidth = std::max(BitWidth, BitWidth1);
    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
  };
  using namespace std::placeholders;
  auto FinalAnalysis = [&]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.State == TreeEntry::NeedToGather) {
      // Check possible extractelement instructions bases and final vector
      // length.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          TTI->getNumberOfParts(FixedVectorType::get(OrigScalarTy, VF)) ==
              TTI->getNumberOfParts(FixedVectorType::get(
                  IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.State == TreeEntry::NeedToGather || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !getTreeEntry(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !all_of(V->users(), [=](User *U) {
          return getTreeEntry(U) ||
                 (UserIgnoreList && UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, Level, IsProfitableToDemote,
                                 IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          BitWidth = OrigBitWidth;
          return false;
        }
        MaxDepthLevel = 1;
        BitWidth = BestFailBitwidth;
        NeedToExit = true;
        return true;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth,
          ArrayRef<const TreeEntry *> Operands = std::nullopt,
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                              std::ref(BitWidth)));
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (E.UserTreeIndices.size() > 1 &&
              !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                           std::ref(BitWidth))))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };
  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and it is a shift of an
    // in-range amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // We can truncate a logical shr to a smaller lshr iff we know that the
    // bits we would otherwise be shifting in are already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // We can truncate an arithmetic shr to a smaller ashr iff all the bits
    // from the sign bit of the original type down to the sign bit of the
    // truncated type are equal.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                                nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands. Note
  // that we don't need to worry about cycles since we ensure single use
  // above.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        return SignBits <= ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                              nullptr, DT) &&
               SignBits <= ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
                                              nullptr, DT);
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      auto *VecTy = FixedVectorType::get(
          IntegerType::get(IC->getContext(), BitWidth), VF);
      IntrinsicCostAttributes ICA(ID, VecTy, {VecTy, VecTy});
      InstructionCost Cost =
          TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up demoting the expression.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
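// Computes the minimum bit widths that the tree's integer values can be
// narrowed to, recording the result in MinBWs and the reduction bit width.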
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->getOpcode() == Instruction::Store ||
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt &&
      VectorizableTree.front()->State != TreeEntry::NeedToGather)
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
      (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                              [NodeIdx](const EdgeInfo &EI) {
                                return EI.UserTE->Idx >
                                       static_cast<int>(NodeIdx);
                              })))
    return;

  // The first value node for store/insertelement is sext/zext/trunc? Skip it,
  // it is only a part of the original expression.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
                                bool IsProfitableToDemoteRoot, unsigned Opcode,
                                unsigned Limit, bool IsTruncRoot,
                                bool IsSignedCmp) {
    ToDemote.clear();
    unsigned VF = E.getVectorFactor();
    auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
    if (!TreeRootIT || !Opcode)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    unsigned NumParts =
        TTI->getNumberOfParts(FixedVectorType::get(TreeRootIT, VF));

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type,
    // rather than sign-extended. We know that if the leading bits are not
    // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
    // True.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    // We first check if all the bits of the roots are demanded. If they're
    // not, we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
      TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, we must add one to the
      // maximum bit width to account for the unknown sign bit. This preserves
      // the existing sign bit so we can safely sign-extend the root back to
      // the original type.
      if (!IsKnownPositive)
        ++BitWidth1;

      APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // If the original type is large, but the reduced type does not improve
    // register usage - ignore it.
    if (NumParts > 1 &&
        NumParts ==
            TTI->getNumberOfParts(FixedVectorType::get(
                IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
      return 0u;

    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the values that can be demoted in ToDemote.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, MaxDepthLevel, NeedToDemote,
                               IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
                                               ->getOperand(0)
                                               ->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);

    return MaxBitWidth;
  };

  // Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    for (Value *V : *UserIgnoreList) {
      unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
      auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
        ++BitWidth1;
      unsigned BitWidth2 = BitWidth1;
      if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
        APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
        BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      }
      ReductionBitWidth =
          std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
    }
    if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
      ReductionBitWidth = 8;

    ReductionBitWidth = bit_ceil(ReductionBitWidth);
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx].get(), IsTopRoot, IsProfitableToDemoteRoot,
        Opcode, Limit, IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [](const EdgeInfo &EI) {
                   return EI.EdgeIdx == 0 &&
                          EI.UserTE->getOpcode() == Instruction::Trunc &&
                          !EI.UserTE->isAltShuffle();
                 });
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [&](const EdgeInfo &EI) {
                   return EI.UserTE->getOpcode() == Instruction::ICmp &&
                          any_of(EI.UserTE->Scalars, [&](Value *V) {
                            auto *IC = dyn_cast<ICmpInst>(V);
                            return IC &&
                                   (IC->isSigned() ||
                                    !isKnownNonNegative(IC->getOperand(0),
                                                        SimplifyQuery(*DL)) ||
                                    !isKnownNonNegative(IC->getOperand(1),
                                                        SimplifyQuery(*DL)));
                          });
                 });
    }

    // If the maximum bit width we computed is less than the width of the
    // roots' type, we can proceed with the narrowing. Otherwise, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = TE->getOpcode() == Instruction::SExt ||
                      any_of(TE->Scalars, [&](Value *R) {
                        return !isKnownNonNegative(R, SimplifyQuery(*DL));
                      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
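// Pass entry points: the new-PM run() wrapper collects the analyses and then
// calls runImpl(), which drives per-block seed collection and vectorization.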
PreservedAnalyses SLPVectorizerPass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
  auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
  auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
  auto *AA = &AM.getResult<AAManager>(F);
  auto *LI = &AM.getResult<LoopAnalysis>(F);
  auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
  auto *AC = &AM.getResult<AssumptionAnalysis>(F);
  auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
  auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
  if (!RunSLPVectorization)
    return false;
  SE = SE_;
  TTI = TTI_;
  TLI = TLI_;
  AA = AA_;
  LI = LI_;
  DT = DT_;
  AC = AC_;
  DB = DB_;
  DL = &F.getParent()->getDataLayout();

  Stores.clear();
  GEPs.clear();
  bool Changed = false;

  // If the target claims to have no vector registers don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom up slp vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // Update DFS numbers now so that we can use them for ordering.
  DT->updateDFSNumbers();

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // Start new block - clear the list of reduction roots.
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
    LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
  }
  return Changed;
}
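// Tries to vectorize a chain of consecutive stores. Returns true on success,
// false if not profitable, and std::nullopt if the chain could not even be
// scheduled (so retrying with a different split is pointless).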
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  Size = 0;
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
    // Check if vectorizing with a non-power-of-2 VF should be considered. At
    // the moment, only consider cases where VF + 1 is a power-of-2, i.e.
    // almost all vector lanes are used.
    if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
      return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsPowerOf2 =
        isPowerOf2_32(ValOps.size()) ||
        (VectorizeNonPowerOf2 && isPowerOf2_32(ValOps.size() + 1));
    if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
         (!S.MainOp->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(),
                 [&](Value *V) {
                   return !isa<ExtractElementInst>(V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(V->users(), [&](User *U) {
                             return !Stores.contains(U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
      Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
      return false;
    }
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree is tiny and the store itself or its value is not
  // vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getTreeSize();
    return false;
  }
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.buildExternalUses();

  R.computeMinimumValueSizes();
  R.transformNodes();

  Size = R.getTreeSize();
  if (S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
                    << "\n");
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}
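/// Checks if the quadratic mean deviation of the recorded tree sizes is small
/// relative to the mean, i.e. the candidate store slices have similar enough
/// tree sizes to be worth retrying with a larger VF.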
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  const unsigned Mean =
      std::accumulate(Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                        unsigned Size = First ? Val.first : Val.second;
                        if (Size == 1)
                          return V;
                        return V + Size;
                      }) /
      Sizes.size();
  if (Mean == 0)
    return true;
  const unsigned Dev =
      std::accumulate(Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                        unsigned P = First ? Val.first : Val.second;
                        if (P == 1)
                          return V;
                        return V + (P - Mean) * (P - Mean);
                      }) /
      Sizes.size();
  return Dev * 81 / (Mean * Mean) == 0;
}
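// Groups the collected stores by base pointer and relative distance, then
// tries to vectorize each run of consecutive stores, retrying with different
// vector factors.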
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store
  // twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  struct StoreDistCompare {
    bool operator()(const std::pair<unsigned, int> &Op1,
                    const std::pair<unsigned, int> &Op2) const {
      return Op1.second < Op2.second;
    }
  };
  // A set of pairs (index of store in Stores array ref, distance of the store
  // address relative to the base store address in units).
  using StoreIndexToDistSet =
      std::set<std::pair<unsigned, int>, StoreDistCompare>;
  auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
    int PrevDist = -1;
    BoUpSLP::ValueList Operands;
    // Collect the chain into the list.
    for (auto [Idx, Data] : enumerate(Set)) {
      if (Operands.empty() || Data.second - PrevDist == 1) {
        Operands.push_back(Stores[Data.first]);
        PrevDist = Data.second;
        if (Idx != Set.size() - 1)
          continue;
      }
      auto E = make_scope_exit([&, &DataVar = Data]() {
        Operands.clear();
        Operands.push_back(Stores[DataVar.first]);
        PrevDist = DataVar.second;
      });

      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      unsigned MaxRegVF = MaxVF;
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      if (ValueTy == StoreTy &&
          R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
        MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      unsigned MinVF = std::max<unsigned>(
          2, PowerOf2Ceil(TTI->getStoreMinimumVF(
                 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
                 ValueTy)));

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF ("
                          << MaxVF << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // lanes are used.
        unsigned CandVF = Operands.size();
        if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF)
          NonPowerOf2VF = CandVF;
      }

      unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
      SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
      unsigned Size = MinVF;
      for_each(reverse(CandidateVFs), [&](unsigned &VF) {
        VF = Size > MaxVF ? NonPowerOf2VF : Size;
        Size *= 2;
      });
      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
        P.first = P.second = 1;
      });
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph;
        for (unsigned Size : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned StartIdx = std::distance(
              RangeSizes.begin(),
              find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
                                            std::placeholders::_1)));
          while (StartIdx < End) {
            unsigned EndIdx =
                std::distance(RangeSizes.begin(),
                              find_if(RangeSizes.drop_front(StartIdx),
                                      std::bind(IsVectorized, Size >= MaxRegVF,
                                                std::placeholders::_1)));
            unsigned Sz = EndIdx >= End ? End : EndIdx;
            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
              if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
                                  Size >= MaxRegVF)) {
                ++Cnt;
                continue;
              }
              ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
                  Cnt += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
              if (!Res) {
                NonSchedulable
                    .try_emplace(Slice.front(), std::make_pair(Size, Size))
                    .first->getSecond()
                    .second = Size;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert(Slice.begin(), Slice.end());
                AnyProfitableGraph = RepeatChanged = Changed = true;
                // If we vectorized the initial block, no need to try to
                // vectorize it again.
                for_each(RangeSizes.slice(Cnt, Size),
                         [](std::pair<unsigned, unsigned> &P) {
                           P.first = P.second = 0;
                         });
                if (Cnt < StartIdx + MinVF) {
                  for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  StartIdx = Cnt + Size;
                }
                if (Cnt > Sz - Size - MinVF) {
                  for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  Sz = Cnt;
                }
                Cnt += Size;
                continue;
              }
              if (Size > 2 && Res &&
                  !all_of(RangeSizes.slice(Cnt, Size),
                          std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
                                    std::placeholders::_1))) {
                Cnt += Size;
                continue;
              }
              // Check for the very big VFs that we're not rebuilding the same
              // trees, just with a larger number of elements.
              if (Size > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(Cnt, Size),
                         std::bind(FirstSizeSame, TreeSize,
                                   std::placeholders::_1))) {
                Cnt += Size;
                while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
                  ++Cnt;
                continue;
              }
              if (TreeSize > 1)
                for_each(RangeSizes.slice(Cnt, Size),
                         [&](std::pair<unsigned, unsigned> &P) {
                           if (Size >= MaxRegVF)
                             P.second = std::max(P.second, TreeSize);
                           else
                             P.first = std::max(P.first, TreeSize);
                         });
              ++Cnt;
              AnyProfitableGraph = true;
            }
            if (StartIdx >= End)
              break;
            if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
              AnyProfitableGraph = true;
            StartIdx = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(Sz),
                        std::bind(IsNotVectorized, Size >= MaxRegVF,
                                  std::placeholders::_1)));
          }
          if (!AnyProfitableGraph && Size >= MaxRegVF)
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if we tried all attempts or there is no need for the last
        // attempts at all.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
            Operands.size(),
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes, std::bind(IsNotVectorized, true,
                                                  std::placeholders::_1))) +
                1)));
        unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        });
        // Last attempt to vectorize max number of elements, if all previous
        // attempts were unsuccessful because of the cost issues.
        CandidateVFs.clear();
        CandidateVFs.push_back(VF);
      }
    }
  };

  // Inserts the specified store SI with the given index Idx to the set of the
  // stores. If a store with the same distance is found already, try to
  // vectorize the already found stores and start a new sequence, re-adding
  // the not-yet-vectorized tail stores where it is safe to do so.
  SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
      std::optional<int> Diff = getPointersDiff(
          Stores[Set.first]->getValueOperand()->getType(),
          Stores[Set.first]->getPointerOperand(),
          SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
          /*StrictCheck=*/true);
      if (!Diff)
        continue;
      auto It = Set.second.find(std::make_pair(Idx, *Diff));
      if (It == Set.second.end()) {
        Set.second.emplace(Idx, *Diff);
        return;
      }
      // Try to vectorize the first found set to avoid duplicate analysis.
      TryToVectorize(Set.second);
      StoreIndexToDistSet PrevSet;
      PrevSet.swap(Set.second);
      Set.first = Idx;
      Set.second.emplace(Idx, 0);
      // Insert stores that followed the previous match to try to vectorize
      // them with this store.
      unsigned StartIdx = It->first + 1;
      SmallBitVector UsedStores(Idx - StartIdx);
      // Distances to the previously found dup store (or this store, since
      // they store to the same addresses).
      SmallVector<int> Dists(Idx - StartIdx, 0);
      for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
        // Do not try to vectorize sequences we already tried.
        if (Pair.first <= It->first ||
            VectorizedStores.contains(Stores[Pair.first]))
          break;
        unsigned BI = Pair.first - StartIdx;
        UsedStores.set(BI);
        Dists[BI] = Pair.second - It->second;
      }
      for (unsigned I = StartIdx; I < Idx; ++I) {
        unsigned BI = I - StartIdx;
        if (UsedStores.test(BI))
          Set.second.emplace(I, Dists[BI]);
      }
      return;
    }
    auto &Res = SortedStores.emplace_back();
    Res.first = Idx;
    Res.second.emplace(Idx, 0);
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (auto &Set : SortedStores)
        TryToVectorize(Set.second);
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (auto &Set : SortedStores)
    TryToVectorize(Set.second);

  return Changed;
}
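// Makes a single pass over the block and collects the seed store and
// getelementptr instructions, bucketed by the underlying object of their
// pointer operand.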
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
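// Tries to vectorize an arbitrary list of scalars, iterating over decreasing
// vector factors and over starting offsets within the list.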
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type, we permit
  // an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S.getOpcode())
    return false;

  Instruction *I0 = cast<Instruction>(S.OpValue);
  // Make sure invalid types (including vector type) are rejected before
  // determining vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give user internal llvm type name, which may
      // not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream rso(TypeStr);
        Ty->print(rso);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << rso.str() + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();
  Type *ScalarTy = VL[0]->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
    ScalarTy = IE->getOperand(1)->getType();

  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);

      if (!isPowerOf2_32(ActualVF))
        continue;

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
        break;

      ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
      // Check that a previous iteration of this loop did not delete the
      // Value.
      if (llvm::any_of(Ops, [&R](Value *V) {
            auto *I = dyn_cast<Instruction>(V);
            return I && R.isDeleted(I);
          }))
        continue;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      R.reorderTopToBottom();
      R.reorderBottomToTop(
          /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
          !R.doesRootHaveInTreeUses());
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      R.transformNodes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
    return false;

  // First collect all possible candidates.
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P)
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P)
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P)
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P)
      Candidates.emplace_back(A1, B);
  }

  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
      R);
}
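// Model of a horizontal reduction: matches an associative reduction tree
// rooted at a binary operation (or a cmp+select min/max idiom), collects the
// reduced values, and emits a vectorized reduction if profitable.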
class HorizontalReduction {
  using ReductionOpsType = SmallVector<Value *, 16>;
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  ReductionOpsListType ReductionOps;
  /// List of possibly reduced values.
  SmallVector<SmallVector<Value *>> ReducedVals;
  /// Maps reduced value to the corresponding reduction operation.
  DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
  /// Maps reduction operation to its single extra argument (if any).
  DenseMap<Instruction *, Value *> ExtraArgs;
  WeakTrackingVH ReductionRoot;
  /// The type of reduction operation.
  RecurKind RdxKind;
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;

  static bool isCmpSelMinMax(Instruction *I) {
    return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
           RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
  }

  // And/or are potentially poison-safe logical patterns like:
  // select x, y, false
  // select x, true, y
  static bool isBoolLogicOp(Instruction *I) {
    return isa<SelectInst>(I) &&
           (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
  }

  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I) {
    if (Kind == RecurKind::None)
      return false;

    // Integer ops that map to select instructions or intrinsics are fine.
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
        isBoolLogicOp(I))
      return true;

    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0. We do not have to
      // rule out -0.0 here because the intrinsic semantics do not specify a
      // fixed result for it.
      return I->getFastMathFlags().noNaNs();
    }

    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;

    return I->isAssociative();
  }

  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    // Poison-safe 'or' takes the form: select X, true, Y. To make that work
    // with the normal operand processing, we skip the true value operand.
    if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
      return I->getOperand(2);
    return I->getOperand(Index);
  }

  /// Creates reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
    switch (Kind) {
    case RecurKind::Or:
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    case RecurKind::And:
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul:
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    case RecurKind::FMax:
      return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
    case RecurKind::FMin:
      return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
    case RecurKind::FMaximum:
      return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
    case RecurKind::FMinimum:
      return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
    case RecurKind::SMax:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
    case RecurKind::SMin:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
    case RecurKind::UMax:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
    case RecurKind::UMin:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
  }

  /// Creates reduction operation with the current opcode, propagating IR
  /// flags from \p ReductionOps.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
        propagateIRFlags(Sel->getCondition(),
                         cast<SelectInst>(ReductionOps[0].front())
                             ->getCondition());
        propagateIRFlags(Op, ReductionOps[1]);
        return Op;
      }
    }
    propagateIRFlags(Op, ReductionOps[0]);
    return Op;
  }

public:
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;
    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;
    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Try harder: look for a min/max pattern based on instructions
      // producing the same values, such as:
      // select ((cmp Inst1, Inst2), Inst1, Inst2).
      CmpInst::Predicate Pred;
      Instruction *L1;
      Instruction *L2;

      Value *LHS = Select->getTrueValue();
      Value *RHS = Select->getFalseValue();
      Value *Cond = Select->getCondition();

      // TODO: Support inverse predicates.
      if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
        if (!isa<ExtractElementInst>(RHS) ||
            !L2->isIdenticalTo(cast<Instruction>(RHS)))
          return RecurKind::None;
      } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1),
                                   m_Specific(RHS)))) {
        if (!isa<ExtractElementInst>(LHS) ||
            !L1->isIdenticalTo(cast<Instruction>(LHS)))
          return RecurKind::None;
      } else {
        if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
          return RecurKind::None;
        if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
            !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
            !L2->isIdenticalTo(cast<Instruction>(RHS)))
          return RecurKind::None;
      }

      switch (Pred) {
      default:
        return RecurKind::None;
      case CmpInst::ICMP_SGT:
      case CmpInst::ICMP_SGE:
        return RecurKind::SMax;
      case CmpInst::ICMP_SLT:
      case CmpInst::ICMP_SLE:
        return RecurKind::SMin;
      case CmpInst::ICMP_UGT:
      case CmpInst::ICMP_UGE:
        return RecurKind::UMax;
      case CmpInst::ICMP_ULT:
      case CmpInst::ICMP_ULE:
        return RecurKind::UMin;
      }
    }
    return RecurKind::None;
  }
  /// Return position of the first operand of the reduction operation.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

private:
  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB.
  /// For a cmp+sel min/max reduction check that both ops are in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // SelectInst must be used twice while the condition op must have single
      // use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }

    // Arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }
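  /// A group of reduced values is worth vectorizing if it has more than one
  /// element, or its single element is a constant or a non-load instruction
  /// with a vectorizable opcode.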
  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    auto *I = dyn_cast<Instruction>(Data.front());
    return Sz > 1 || isConstant(Data.front()) ||
           (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
  }

public:
  HorizontalReduction() = default;

  /// Try to find a reduction tree.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;

    // Analyze "regular" integer/FP types for reductions - no target-specific
    // types or pointers.
    Type *Ty = Root->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;

    // Though the ultimate reduction may have multiple uses, its condition
    // must have only single use.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<Instruction *> Worklist(1, Root);
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values or an extra argument,
    // which is not part of the reduction.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &ExtraArgs,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps) {
      for (int I = getFirstOperandIndex(TreeN),
               End = getNumberOfOperands(TreeN);
           I < End; ++I) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // Edge has wrong parent - mark as an extra argument.
        if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
            !hasSameParent(EdgeInst, BB)) {
          ExtraArgs.push_back(EdgeVal);
          continue;
        }
        // If the edge is not an instruction, or it is different from the
        // main reduction opcode, or has too many uses - possible reduced
        // value.
        if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to try to
    // reduce them. Values are grouped by their value ids, loads - by their
    // pointer operands.
    MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
        PossibleReducedVals;
    initReductionOps(Root);
    DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;
    SmallPtrSet<Value *, 4> DoNotReverseVals;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
      if (LoadKeyUsed.contains(Key)) {
        auto LIt = LoadsMap.find(Ptr);
        if (LIt != LoadsMap.end()) {
          for (LoadInst *RLI : LIt->second) {
            if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                                LI->getType(), LI->getPointerOperand(), DL, SE,
                                /*StrictCheck=*/true))
              return hash_value(RLI->getPointerOperand());
          }
          for (LoadInst *RLI : LIt->second) {
            if (arePointersCompatible(RLI->getPointerOperand(),
                                      LI->getPointerOperand(), TLI)) {
              hash_code SubKey = hash_value(RLI->getPointerOperand());
              DoNotReverseVals.insert(RLI);
              return SubKey;
            }
          }
          if (LIt->second.size() > 2) {
            hash_code SubKey =
                hash_value(LIt->second.back()->getPointerOperand());
            DoNotReverseVals.insert(LIt->second.back());
            return SubKey;
          }
        }
      }
      LoadKeyUsed.insert(Key);
      LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    while (!Worklist.empty()) {
      Instruction *TreeN = Worklist.pop_back_val();
      SmallVector<Value *> Args;
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
      // If too many extra args - mark the instruction itself as a reduction
      // value, not a reduction operation.
      if (Args.size() < 2) {
        addReductionOps(TreeN);
        // Add extra args.
        if (!Args.empty()) {
          assert(Args.size() == 1 && "Expected only single argument.");
          ExtraArgs[TreeN] = Args.front();
        }
        // Add reduction values. The values are sorted for better
        // vectorization results.
        for (Value *V : PossibleRedVals) {
          size_t Key, Idx;
          std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                                                 /*AllowAlternate=*/false);
          ++PossibleReducedVals[Key][Idx]
                .insert(std::make_pair(V, 0))
                .first->second;
        }
        Worklist.append(PossibleReductionOps.rbegin(),
                        PossibleReductionOps.rend());
      } else {
        size_t Key, Idx;
        std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI,
                                               GenerateLoadsSubkey,
                                               /*AllowAlternate=*/false);
        ++PossibleReducedVals[Key][Idx]
              .insert(std::make_pair(TreeN, 0))
              .first->second;
      }
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds to start the reduction
    // from the longest possible reduced values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
           It != E; ++It) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = It->second.takeVector();
        stable_sort(RedValsVect, llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      int NewIdx = -1;
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (isGoodForReduction(Data) ||
            (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
             isa<LoadInst>(ReducedVals[NewIdx].front()) &&
             getUnderlyingObject(
                 cast<LoadInst>(Data.front())->getPointerOperand()) ==
                 getUnderlyingObject(
                     cast<LoadInst>(ReducedVals[NewIdx].front())
                         ->getPointerOperand()))) {
          if (NewIdx < 0) {
            NewIdx = ReducedVals.size();
            ReducedVals.emplace_back();
          }
          if (DoNotReverseVals.contains(Data.front()))
            ReducedVals[NewIdx].append(Data.begin(), Data.end());
          else
            ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
        } else {
          ReducedVals.emplace_back().append(Data.rbegin(), Data.rend());
        }
      }
    }
    // Sort the reduced values by the number of same/alternate opcode and/or
    // pointer operand.
    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
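  /// Attempt to vectorize the reduction tree found by
  /// matchAssociativeReduction. Returns the new reduced value on success,
  /// nullptr otherwise.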
  Value *tryToReduce(BoUpSLP &V, const DataLayout &DL,
                     TargetTransformInfo *TTI, const TargetLibraryInfo &TLI) {
    constexpr int ReductionLimit = 4;
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // If there are a sufficient number of reduction values, reduce to a
    // nearby power-of-2. We can safely generate oversized vectors and rely on
    // the backend to split them to legal sizes.
    unsigned NumReducedVals =
        std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
                        [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
                          if (!isGoodForReduction(Vals))
                            return Num;
                          return Num + Vals.size();
                        });
    if (NumReducedVals < ReductionLimit &&
        (!AllowHorRdxIdenityOptimization ||
         all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
           return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
         }))) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      return nullptr;
    }

    IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
                                    TargetFolder(DL));
    Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));

    // Track the reduced values in case they are replaced by extractelement
    // because of the vectorization.
    DenseMap<Value *, WeakTrackingVH> TrackedVals(
        ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
    BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
    SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
    ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
    // The same extra argument may be used several times, so log each attempt
    // to use it.
    for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
      assert(Pair.first && "DebugLoc must be set.");
      ExternallyUsedValues[Pair.second].push_back(Pair.first);
      TrackedVals.try_emplace(Pair.second, Pair.second);
    }

    // The compare instruction of a min/max is the insertion point for new
    // instructions and may be replaced with a new compare instruction.
    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
      assert(isa<SelectInst>(RdxRootInst) &&
             "Expected min/max reduction to have select root instruction");
      Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
      assert(isa<Instruction>(ScalarCond) &&
             "Expected min/max reduction to have compare condition");
      return cast<Instruction>(ScalarCond);
    };

    // Return new VectorizedTree, based on the previous value.
    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
      if (VectorizedTree) {
        // Update the final value in the reduction.
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
        if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
            (isGuaranteedNotToBePoison(Res) &&
             !isGuaranteedNotToBePoison(VectorizedTree))) {
          auto It = ReducedValsToOps.find(Res);
          if (It != ReducedValsToOps.end() &&
              any_of(It->getSecond(),
                     [](Instruction *I) { return isBoolLogicOp(I); }))
            std::swap(VectorizedTree, Res);
        }

        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                        ReductionOps);
      }
      // Initialize the final value in the reduction.
      return Res;
    };
    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
      return isBoolLogicOp(cast<Instruction>(V));
    });
    // The reduction root is used as the insertion point for new instructions,
    // so keep it from the external uses list.
    ExternallyUsedValues[ReductionRoot];
    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                      ReductionOps.front().size());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps) {
        if (!RdxOp)
          continue;
        IgnoreList.insert(RdxOp);
      }
    // Intersect the fast-math flags from all reduction operations.
    FastMathFlags RdxFMF;
    RdxFMF.set();
    for (Value *U : IgnoreList)
      if (auto *FPMO = dyn_cast<FPMathOperator>(U))
        RdxFMF &= FPMO->getFastMathFlags();
    bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));

    // Need to track reduced vals, they may be changed during vectorization of
    // subvectors.
    for (ArrayRef<Value *> Candidates : ReducedVals)
      for (Value *V : Candidates)
        TrackedVals.try_emplace(V, V);

    DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // List of the values that were reduced in other trees as part of gather
    // nodes and thus requiring extract if fully vectorized in other trees.
    SmallPtrSet<Value *, 4> RequiredExtract;
    Value *VectorizedTree = nullptr;
    bool CheckForReusedReductionOps = false;
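    // Iterate over the groups of reduced values and try to vectorize each
    // group, starting from the largest one.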
    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
      ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
      InstructionsState S = getSameOpcode(OrigReducedVals, TLI);
      SmallVector<Value *> Candidates;
      Candidates.reserve(2 * OrigReducedVals.size());
      DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
      for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
        Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
        // Check if the reduction value was not overridden by the
        // extractelement instruction because of the vectorization and exclude
        // it, if it is not compatible with other values. Also check if the
        // instruction was folded to a constant/other value.
        auto *Inst = dyn_cast<Instruction>(RdxVal);
        if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
             (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
            (S.getOpcode() && !Inst))
          continue;
        Candidates.push_back(RdxVal);
        TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
      }
      bool ShuffledExtracts = false;
      // Try to handle shuffled extractelements.
      if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
          I + 1 < E) {
        InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
        if (NextS.getOpcode() == Instruction::ExtractElement &&
            !NextS.isAltShuffle()) {
          SmallVector<Value *> CommonCandidates(Candidates);
          for (Value *RV : ReducedVals[I + 1]) {
            Value *RdxVal = TrackedVals.find(RV)->second;
            // Check if the reduction value was not overridden by the
            // extractelement instruction because of the vectorization and
            // exclude it, if it is not compatible with other values.
            if (auto *Inst = dyn_cast<Instruction>(RdxVal))
              if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
                continue;
            CommonCandidates.push_back(RdxVal);
            TrackedToOrig.try_emplace(RdxVal, RV);
          }
          SmallVector<int> Mask;
          if (isFixedVectorShuffle(CommonCandidates, Mask)) {
            ++I;
            Candidates.swap(CommonCandidates);
            ShuffledExtracts = true;
          }
        }
      }

      // Emit code for constant values.
      if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
          allConstant(Candidates)) {
        Value *Res = Candidates.front();
        ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
        for (Value *VC : ArrayRef(Candidates).drop_front()) {
          Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
          ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
          if (auto *ResI = dyn_cast<Instruction>(Res))
            V.analyzedReductionRoot(ResI);
        }
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
        continue;
      }

      unsigned NumReducedVals = Candidates.size();
      if (NumReducedVals < ReductionLimit &&
          (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
           !isSplat(Candidates)))
        continue;

      // Check if we support repeated scalar values processing (optimization
      // of original scalar identity operations on matched horizontal
      // reductions).
      IsSupportedHorRdxIdentityOp =
          AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
          RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
      // Gather same values.
      MapVector<Value *, unsigned> SameValuesCounter;
      if (IsSupportedHorRdxIdentityOp)
        for (Value *V : Candidates)
          ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
      // Used to check if the reduced values used same number of times. In
      // this case the compiler may produce better code. Currently it only
      // handles the case when all the reduced values are shuffled from the
      // same vector.
      bool SameScaleFactor = false;
      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                              SameValuesCounter.size() != Candidates.size();
      if (OptReusedScalars) {
        SameScaleFactor =
            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
             RdxKind == RecurKind::Xor) &&
            all_of(drop_begin(SameValuesCounter),
                   [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
                     return P.second == SameValuesCounter.front().second;
                   });
        Candidates.resize(SameValuesCounter.size());
        transform(SameValuesCounter, Candidates.begin(),
                  [](const auto &P) { return P.first; });
        NumReducedVals = Candidates.size();
        // Have a reduction of the same element.
        if (NumReducedVals == 1) {
          Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
          unsigned Cnt = SameValuesCounter.lookup(OrigV);
          Value *RedVal =
              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(OrigV, Cnt);
          continue;
        }
      }

      unsigned MaxVecRegSize = V.getMaxVecRegSize();
      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
      unsigned MaxElts =
          RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned ReduxWidth = std::min<unsigned>(
          llvm::bit_floor(NumReducedVals),
          std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
                               RegMaxNumber * RedValsMaxNumber));
      unsigned Start = 0;
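      // Attempt vectorization at decreasing widths: if a bundle at the
      // current width fails, shift the window or halve ReduxWidth and retry.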
      unsigned Pos = Start;
      // Restarts vectorization attempt with lower vector factor.
      unsigned PrevReduxWidth = ReduxWidth;
      bool CheckForReusedReductionOpsLocal = false;
      auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
                                  &CheckForReusedReductionOpsLocal,
                                  &PrevReduxWidth, &V,
                                  &IgnoreList](bool IgnoreVL = false) {
        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, worth
          // trying again with less number of reduction ops.
          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
        }
        ++Pos;
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        Pos = Start;
        ReduxWidth /= 2;
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // Dependency in tree of the reduction ops - drop this attempt, try
        // later.
        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
            Start == 0) {
          CheckForReusedReductionOps = true;
          break;
        }
        PrevReduxWidth = ReduxWidth;
        ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
        // Been analyzed already - skip.
        if (V.areAnalyzedReductionVals(VL)) {
          (void)AdjustReducedVals(/*IgnoreVL=*/true);
          continue;
        }
        // Early exit if any of the reduction values were deleted during
        // previous vectorization attempts.
        if (any_of(VL, [&V](Value *RedVal) {
              auto *RedValI = dyn_cast<Instruction>(RedVal);
              if (!RedValI)
                return false;
              return V.isDeleted(RedValI);
            }))
          break;
        V.buildTree(VL, IgnoreList);
        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        if (V.isLoadCombineReductionCandidate(RdxKind)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        V.reorderTopToBottom();
        // No need to reorder the root node at all.
        V.reorderBottomToTop(/*IgnoreReorder=*/true);
        // Keep extracted other reduction values, if they are used in the
        // vectorization trees.
        BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
            ExternallyUsedValues);
        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
            continue;
          for (Value *V : ReducedVals[Cnt])
            if (isa<Instruction>(V))
              LocalExternallyUsedValues[TrackedVals[V]];
        }
        if (!IsSupportedHorRdxIdentityOp) {
          // Number of uses of the candidates in the vector of values.
          assert(SameValuesCounter.empty() &&
                 "Reused values counter map is not empty");
          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
              continue;
            Value *V = Candidates[Cnt];
            Value *OrigV = TrackedToOrig.find(V)->second;
            ++SameValuesCounter[OrigV];
          }
        }
        SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
        // Gather externally used values.
        SmallPtrSet<Value *, 4> Visited;
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *RdxVal = Candidates[Cnt];
          if (!Visited.insert(RdxVal).second)
            continue;
          // Check if the scalar was vectorized as part of the vectorization
          // tree but not the top node.
          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
            LocalExternallyUsedValues[RdxVal];
            continue;
          }
          Value *OrigV = TrackedToOrig.find(RdxVal)->second;
          unsigned NumOps =
              VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
          if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
            LocalExternallyUsedValues[RdxVal];
        }
        // Do not need the list of reused scalars in regular mode anymore.
        if (!IsSupportedHorRdxIdentityOp)
          SameValuesCounter.clear();
        for (Value *RdxVal : VL)
          if (RequiredExtract.contains(RdxVal))
            LocalExternallyUsedValues[RdxVal];
        // Update LocalExternallyUsedValues for scalars replaced by
        // extractelement instructions.
        DenseMap<Value *, Value *> ReplacementToExternal;
        for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
          ReplacementToExternal.try_emplace(Pair.second, Pair.first);
        for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
          Value *Ext = Pair.first;
          auto RIt = ReplacementToExternal.find(Ext);
          while (RIt != ReplacementToExternal.end()) {
            Ext = RIt->second;
            RIt = ReplacementToExternal.find(Ext);
          }
          auto *It = ExternallyUsedValues.find(Ext);
          if (It == ExternallyUsedValues.end())
            continue;
          LocalExternallyUsedValues[Pair.second].append(It->second);
        }
        V.buildExternalUses(LocalExternallyUsedValues);

        V.computeMinimumValueSizes();
        V.transformNodes();

        // Estimate cost.
        InstructionCost TreeCost = V.getTreeCost(VL);
        InstructionCost ReductionCost =
            getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
        InstructionCost Cost = TreeCost + ReductionCost;
        LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                          << " for reduction\n");
        if (!Cost.isValid())
          break;
        if (Cost >= -SLPCostThreshold) {
          V.getORE()->emit([&]() {
            return OptimizationRemarkMissed(
                       SV_NAME, "HorSLPNotBeneficial",
                       ReducedValsToOps.find(VL[0])->second.front())
                   << "Vectorizing horizontal reduction is possible "
                   << "but not beneficial with cost " << ore::NV("Cost", Cost)
                   << " and threshold "
                   << ore::NV("Threshold", -SLPCostThreshold);
          });
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }

        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                          << Cost << ". (HorRdx)\n");
        V.getORE()->emit([&]() {
          return OptimizationRemark(
                     SV_NAME, "VectorizedHorizontalReduction",
                     ReducedValsToOps.find(VL[0])->second.front())
                 << "Vectorized horizontal reduction with cost "
                 << ore::NV("Cost", Cost) << " and with tree size "
                 << ore::NV("TreeSize", V.getTreeSize());
        });
16985 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
16987 if (IsCmpSelMinMax)
16988 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
16991 Value *VectorizedRoot =
V.vectorizeTree(LocalExternallyUsedValues,
16992 ReplacedExternals, InsertPt);
16999 if ((isBoolLogicOp(RdxRootInst) ||
17000 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
17002 VectorizedRoot = Builder.
CreateFreeze(VectorizedRoot);
17005 if (OptReusedScalars && !SameScaleFactor) {
17007 emitReusedOps(VectorizedRoot, Builder,
V.getRootNodeScalars(),
17008 SameValuesCounter, TrackedToOrig);
17011 Value *ReducedSubTree =
17012 emitReduction(VectorizedRoot, Builder, ReduxWidth,
TTI);
17013 if (ReducedSubTree->
getType() != VL.front()->getType()) {
17015 ReducedSubTree, VL.front()->getType(),
any_of(VL, [&](
Value *R) {
17017 R, cast<Instruction>(ReductionOps.front().front())
17019 ->getDataLayout());
17027 if (OptReusedScalars && SameScaleFactor)
17028 ReducedSubTree = emitScaleForReusedOps(
17029 ReducedSubTree, Builder, SameValuesCounter.
front().second);
17031 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
17033 for (
Value *RdxVal : VL) {
17034 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17035 if (IsSupportedHorRdxIdentityOp) {
17036 VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
17039 ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
17040 if (!
V.isVectorized(RdxVal))
17041 RequiredExtract.
insert(RdxVal);
17046 AnyVectorized =
true;
17048 if (OptReusedScalars && !AnyVectorized) {
17049 for (
const std::pair<Value *, unsigned> &
P : SameValuesCounter) {
17050 Value *RedVal = emitScaleForReusedOps(
P.first, Builder,
P.second);
17051 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17052 Value *OrigV = TrackedToOrig.find(
P.first)->second;
17053 VectorizedVals.try_emplace(OrigV,
P.second);
17058 if (VectorizedTree) {
17079 if (!AnyBoolLogicOp)
17081 if (isBoolLogicOp(RedOp1) &&
17082 ((!InitStep &&
LHS == VectorizedTree) ||
17085 if (isBoolLogicOp(RedOp2) && ((!InitStep &&
RHS == VectorizedTree) ||
17086 getRdxOperand(RedOp2, 0) ==
RHS ||
17091 if (
LHS != VectorizedTree)
17102 unsigned Sz = InstVals.
size();
17105 for (
unsigned I = 0,
E = (Sz / 2) * 2;
I <
E;
I += 2) {
17108 Value *RdxVal1 = InstVals[
I].second;
17109 Value *StableRdxVal1 = RdxVal1;
17110 auto It1 = TrackedVals.find(RdxVal1);
17111 if (It1 != TrackedVals.end())
17112 StableRdxVal1 = It1->second;
17113 Value *RdxVal2 = InstVals[
I + 1].second;
17114 Value *StableRdxVal2 = RdxVal2;
17115 auto It2 = TrackedVals.find(RdxVal2);
17116 if (It2 != TrackedVals.end())
17117 StableRdxVal2 = It2->second;
17121 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[
I].first,
17123 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
17124 StableRdxVal2,
"op.rdx", ReductionOps);
17125 ExtraReds[
I / 2] = std::make_pair(InstVals[
I].first, ExtraRed);
17128 ExtraReds[Sz / 2] = InstVals.
back();
17132 ExtraReductions.
emplace_back(cast<Instruction>(ReductionRoot),
17136 for (
Value *RdxVal : Candidates) {
17137 if (!Visited.
insert(RdxVal).second)
17139 unsigned NumOps = VectorizedVals.lookup(RdxVal);
17146 for (
auto &Pair : ExternallyUsedValues) {
17148 for (
auto *
I : Pair.second)
17152 bool InitStep =
true;
17153 while (ExtraReductions.
size() > 1) {
17154 VectorizedTree = ExtraReductions.
front().second;
17156 FinalGen(ExtraReductions, InitStep);
17157 ExtraReductions.
swap(NewReds);
17160 VectorizedTree = ExtraReductions.
front().second;
17162 ReductionRoot->replaceAllUsesWith(VectorizedTree);
17171 IgnoreSet.
insert(RdxOps.begin(), RdxOps.end());
17178 for (
auto *U :
Ignore->users()) {
17180 "All users must be either in the reduction ops list.");
17183 if (!
Ignore->use_empty()) {
17185 Ignore->replaceAllUsesWith(Undef);
17187 V.eraseInstruction(cast<Instruction>(
Ignore));
17190 }
else if (!CheckForReusedReductionOps) {
17191 for (ReductionOpsType &RdxOps : ReductionOps)
17192 for (
Value *RdxOp : RdxOps)
17193 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17195 return VectorizedTree;
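
// The retry schedule above is easier to see in isolation: try the widest
// power-of-two window over the candidate list, slide it right on failure,
// and halve the width once every position has been tried. A minimal
// standalone sketch of that schedule (hypothetical names, standard C++
// only; illustrates the idea, not part of this pass):
#include <cstdio>

static void scanReductionSchedule(unsigned NumVals, unsigned MaxWidth,
                                  unsigned MinWidth) {
  unsigned Pos = 0, Width = MaxWidth;
  while (Width >= MinWidth) {
    if (Pos + Width <= NumVals) {
      std::printf("try candidates [%u, %u)\n", Pos, Pos + Width); // attempt
      ++Pos;      // on failure: slide the window one element right
    } else {
      Pos = 0;    // all positions tried at this width:
      Width /= 2; // restart with half the vector factor
    }
  }
}
// For NumVals = 6, MaxWidth = 4, MinWidth = 4 this prints the three windows
// [0,4), [1,5), [2,6) and stops, mirroring AdjustReducedVals above (which
// additionally restarts past a successful window rather than sliding).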
/// Calculate the cost of a reduction.
InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                 ArrayRef<Value *> ReducedVals,
                                 bool IsCmpSelMinMax, unsigned ReduxWidth,
                                 FastMathFlags FMF) {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  Type *ScalarTy = ReducedVals.front()->getType();
  FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
  InstructionCost VectorCost = 0, ScalarCost;
  // If all of the reduced values are constant, the vector cost is 0, since
  // the reduction value can be calculated at compile time.
  bool AllConsts = allConstant(ReducedVals);
  auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
    InstructionCost Cost = 0;
    // Scalar cost is repeated for N-1 elements.
    int Cnt = ReducedVals.size();
    for (Value *RdxVal : ReducedVals) {
      if (Cnt == 1)
        break;
      --Cnt;
      if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
        Cost += GenCostFn();
        continue;
      }
      InstructionCost ScalarCost = 0;
      for (User *U : RdxVal->users()) {
        auto *RdxOp = cast<Instruction>(U);
        if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
          ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
          continue;
        }
        ScalarCost = InstructionCost::getInvalid();
        break;
      }
      if (ScalarCost.isValid())
        Cost += ScalarCost;
      else
        Cost += GenCostFn();
    }
    return Cost;
  };
  switch (RdxKind) {
  case RecurKind::Add:
  case RecurKind::Mul:
  case RecurKind::Or:
  case RecurKind::And:
  case RecurKind::Xor:
  case RecurKind::FAdd:
  case RecurKind::FMul: {
    unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
    if (!AllConsts)
      VectorCost =
          TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
    ScalarCost = EvaluateScalarCost([&]() {
      return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
    });
    break;
  }
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin: {
    Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
    if (!AllConsts)
      VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
    ScalarCost = EvaluateScalarCost([&]() {
      IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
      return TTI->getIntrinsicInstrCost(ICA, CostKind);
    });
    break;
  }
  default:
    llvm_unreachable("Expected arithmetic or min/max reduction operation");
  }

  LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                    << " for reduction of " << shortBundleName(ReducedVals)
                    << " (It is a splitting reduction)\n");
  return VectorCost - ScalarCost;
}
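
// The decision above boils down to comparing one vector reduction against
// the N-1 scalar ops it replaces. A compilable sketch of that policy with
// plain integers standing in for TTI costs (hypothetical numbers and names;
// a sketch of the comparison, not the real cost model):
static bool isReductionProfitable(int VectorReductionCost, int ScalarOpCost,
                                  unsigned NumReducedVals, int Threshold) {
  // N reduced values need N-1 scalar reduction ops.
  int ScalarCost = ScalarOpCost * int(NumReducedVals - 1);
  int Delta = VectorReductionCost - ScalarCost;
  // Vectorize only when the saving beats the (usually zero) threshold,
  // matching the caller's rejection test Cost >= -SLPCostThreshold.
  return Delta < -Threshold;
}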
/// Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                     unsigned ReduxWidth, const TargetTransformInfo *TTI) {
  assert(VectorizedValue && "Need to have a vectorized tree node");
  assert(isPowerOf2_32(ReduxWidth) &&
         "We only handle power-of-two reductions for now");
  assert(RdxKind != RecurKind::FMulAdd &&
         "A call to the llvm.fmuladd intrinsic is not handled yet");

  ++NumVectorInstructions;
  return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
}

/// Emits optimized code for unique scalar value reused \p Cnt times.
Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                             unsigned Cnt) {
  assert(IsSupportedHorRdxIdentityOp &&
         "The optimization of matched scalar identity horizontal reductions "
         "must be supported.");
  switch (RdxKind) {
  case RecurKind::Add: {
    // res = mul vv, n
    Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
    LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                      << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateMul(VectorizedValue, Scale);
  }
  case RecurKind::Xor: {
    // res = n % 2 ? 0 : vv
    LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                      << ". (HorRdx)\n");
    if (Cnt % 2 == 0)
      return Constant::getNullValue(VectorizedValue->getType());
    return VectorizedValue;
  }
  case RecurKind::FAdd: {
    // res = fmul v, n
    Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
    LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                      << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateFMul(VectorizedValue, Scale);
  }
  case RecurKind::And:
  case RecurKind::Or:
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin:
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
    // res = vv
    return VectorizedValue;
  case RecurKind::Mul:
  case RecurKind::FMul:
  case RecurKind::FMulAdd:
  case RecurKind::IAnyOf:
  case RecurKind::FAnyOf:
  case RecurKind::None:
    llvm_unreachable("Unexpected reduction kind for repeated scalar.");
  }
  return nullptr;
}
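
// The identities used above are plain arithmetic: adding the same value n
// times is a multiply, xor-ing it n times keeps it only for odd n, and
// min/max/and/or are idempotent. A standalone check of those identities on
// scalars (hypothetical helper, standard C++ only):
#include <cassert>

static void checkRepeatIdentities(int X, unsigned N) {
  int Add = 0, Xor = 0, Max = X;
  for (unsigned I = 0; I < N; ++I) {
    Add += X;                // RecurKind::Add of N copies
    Xor ^= X;                // RecurKind::Xor of N copies
    Max = Max > X ? Max : X; // RecurKind::SMax of N copies
  }
  assert(Add == X * int(N));      // add -> mul by N
  assert(Xor == (N % 2 ? X : 0)); // xor -> X if N odd, else 0
  assert(N == 0 || Max == X);     // max -> idempotent
}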
/// Emits actual operation for the scalar identity values, found during
/// horizontal reduction analysis.
Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                     ArrayRef<Value *> VL,
                     const MapVector<Value *, unsigned> &SameValuesCounter,
                     const DenseMap<Value *, Value *> &TrackedToOrig) {
  assert(IsSupportedHorRdxIdentityOp &&
         "The optimization of matched scalar identity horizontal reductions "
         "must be supported.");
  auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
  if (VTy->getElementType() != VL.front()->getType()) {
    VectorizedValue = Builder.CreateIntCast(
        VectorizedValue,
        FixedVectorType::get(VL.front()->getType(), VTy->getNumElements()),
        any_of(VL, [&](Value *R) {
          KnownBits Known = computeKnownBits(
              R, cast<Instruction>(ReductionOps.front().front())
                     ->getDataLayout());
          return !Known.isNonNegative();
        }));
  }
  switch (RdxKind) {
  case RecurKind::Add: {
    // root = mul prev_root, <1, 1, n, 1>
    SmallVector<Constant *> Vals;
    for (Value *V : VL) {
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
      Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
    }
    Value *Scale = ConstantVector::get(Vals);
    LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
                      << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateMul(VectorizedValue, Scale);
  }
  case RecurKind::And:
  case RecurKind::Or:
    // No need for multiple or/and(s).
    LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                      << ". (HorRdx)\n");
    return VectorizedValue;
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin:
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
    // No need for multiple min/max(s) of the same value.
    LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
                      << ". (HorRdx)\n");
    return VectorizedValue;
  case RecurKind::Xor: {
    // Replace values with even number of repeats with 0, since
    // x xor x = 0.
    // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 7>
    SmallVector<int> Mask(
        cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
        PoisonMaskElem);
    std::iota(Mask.begin(), Mask.end(), 0);
    bool NeedShuffle = false;
    for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
      Value *V = VL[I];
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
      if (Cnt % 2 == 0) {
        Mask[I] = VF;
        NeedShuffle = true;
      }
    }
    LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I : Mask) dbgs() << I << " ";
               dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
    if (NeedShuffle)
      VectorizedValue = Builder.CreateShuffleVector(
          VectorizedValue,
          ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
    return VectorizedValue;
  }
  case RecurKind::FAdd: {
    // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
    SmallVector<Constant *> Vals;
    for (Value *V : VL) {
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
      Vals.push_back(ConstantFP::get(V->getType(), Cnt));
    }
    Value *Scale = ConstantVector::get(Vals);
    return Builder.CreateFMul(VectorizedValue, Scale);
  }
  case RecurKind::Mul:
  case RecurKind::FMul:
  case RecurKind::FMulAdd:
  case RecurKind::IAnyOf:
  case RecurKind::FAnyOf:
  case RecurKind::None:
    llvm_unreachable("Unexpected reduction kind for reused scalars.");
  }
  return nullptr;
}
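
// Per-lane scaling generalizes the scalar identities to a vector: each lane
// carries its own repeat count, so an add-reduction is pre-multiplied by a
// constant vector of counts, and even-count lanes are zeroed for xor. A
// small sketch over plain arrays (hypothetical names, standard C++ only;
// models the IR the code above emits):
static long long scaledAddReduction(const int *Lanes, const unsigned *Counts,
                                    unsigned VF) {
  long long Sum = 0;
  for (unsigned I = 0; I < VF; ++I)
    Sum += (long long)Lanes[I] * Counts[I]; // mul by <C0, C1, ...>, reduce
  return Sum;
}

static int scaledXorReduction(const int *Lanes, const unsigned *Counts,
                              unsigned VF) {
  int Acc = 0;
  for (unsigned I = 0; I < VF; ++I)
    if (Counts[I] % 2) // even-count lanes are shuffled to zero above
      Acc ^= Lanes[I];
  return Acc;
}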
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}

static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();

  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0))
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}
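
// The walk above multiplies the extents of homogeneous nesting levels:
// [4 x {float, float}] flattens to 4 * 2 = 8 scalar slots, while a struct
// with mixed member types yields "no size". A standalone model using a
// list of per-level extents (hypothetical representation, standard C++
// only):
#include <optional>
#include <vector>

static std::optional<unsigned>
flattenedSize(const std::vector<unsigned> &LevelExtents) {
  unsigned Size = 1;
  for (unsigned Extent : LevelExtents) {
    if (Extent == 0)
      return std::nullopt; // degenerate level: not mappable to a vector
    Size *= Extent;
  }
  return Size; // e.g. {4, 2} -> 8 for [4 x {float, float}]
}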
static void findBuildAggregate_rec(Instruction *LastInsertInst,
                                   TargetTransformInfo *TTI,
                                   SmallVectorImpl<Value *> &BuildVectorOpds,
                                   SmallVectorImpl<Value *> &InsertElts,
                                   unsigned OperandOffset) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(1);
    std::optional<unsigned> OperandIndex =
        getInsertIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex)
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
                             BuildVectorOpds, InsertElts, *OperandIndex);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}

static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");

  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);

  findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
  llvm::erase(BuildVectorOpds, nullptr);
  llvm::erase(InsertElts, nullptr);
  if (BuildVectorOpds.size() >= 2)
    return true;
  return false;
}
static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBLoop = LI->getLoopFor(ParentBB);
  if (!BBLoop)
    return nullptr;
  BasicBlock *BBLatch = BBLoop->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch, return the incoming value if it comes from that.
  // This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}
/// We could have an initial reduction that is not an add.
///  r *= v1 + v2 + v3 + v4
/// In such a case start looking for a tree rooted in the first '+'.
static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
                                                 Instruction *Root) {
  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
          isa<IntrinsicInst>(Root)) &&
         "Expected binop, select, or intrinsic for reduction matching");
  Value *LHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
  Value *RHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  if (LHS == Phi)
    return dyn_cast<Instruction>(RHS);
  if (RHS == Phi)
    return dyn_cast<Instruction>(LHS);
  return nullptr;
}

/// Returns the first operand of \p I that does not match \p Phi. If operand
/// is not an instruction it returns nullptr.
static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
  if (!matchRdxBop(I, Op0, Op1))
    return nullptr;
  return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
}

/// \Returns true if \p I is a candidate instruction for reduction
/// vectorization.
static bool isReductionCandidate(Instruction *I) {
  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
  Value *B0 = nullptr, *B1 = nullptr;
  bool IsBinop = matchRdxBop(I, B0, B1);
  return IsBinop || IsSelect;
}
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    TargetTransformInfo *TTI,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  if (!ShouldVectorizeHor)
    return false;
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);

  if (Root->getParent() != BB || isa<PHINode>(Root))
    return false;

  // If we can find a secondary reduction root, use that instead.
  auto SelectRoot = [&]() {
    if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
        HorizontalReduction::getRdxKind(Root) != RecurKind::None)
      if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
        return NewRoot;
    return Root;
  };

  // Start analysis starting from Root instruction. If horizontal reduction is
  // found, try to vectorize it. If it is not a horizontal reduction or
  // vectorization is not possible or not effective, and currently analyzed
  // instruction is a binary operation, try to vectorize the operands, using
  // pre-order DFS traversal order.
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    if (!isReductionCandidate(Inst))
      return nullptr;
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      FutureSeed = getNonPhiOperand(Root, P);
      if (!FutureSeed)
        return false;
    }
    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
    // analysis is done separately.
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
      PostponedInsts.push_back(FutureSeed);
    return true;
  };

  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // Do not try to analyze instruction that has already been vectorized.
    // This may happen when we vectorize instruction operands on a previous
    // iteration while stack was populated before that happened.
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        // Try to find another reduction.
        Stack.emplace(I, Level);
        continue;
      }
    } else {
      // We could not vectorize `Inst` so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        assert(Stack.empty() && "Expected empty stack");
        break;
      }
    }

    // Try to vectorize operands. Continue analysis for the instruction from
    // the same basic block only to save compile time.
    if (++Level < RecursionMaxDepth)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Op).second)
          if (auto *I = dyn_cast<Instruction>(Op))
            // Do not try to vectorize CmpInst operands, this is done
            // separately.
            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(
                    I) &&
                !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(I, Level);
  }
  return Res;
}

bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 TargetTransformInfo *TTI) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}
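
// The traversal above is a breadth-first scan of the operand graph with a
// depth budget, seeded at the (possibly re-selected) root. A standalone
// sketch over a plain adjacency list (hypothetical graph type, standard
// C++ only; models the Stack/Level bookkeeping, not the reduction logic):
#include <queue>
#include <utility>
#include <vector>

static void bfsWithDepthBudget(
    const std::vector<std::vector<unsigned>> &Operands, unsigned Root,
    unsigned MaxDepth, std::vector<unsigned> &VisitOrder) {
  std::vector<bool> Visited(Operands.size(), false);
  Visited[Root] = true;
  std::queue<std::pair<unsigned, unsigned>> Stack; // (node, level), FIFO
  Stack.emplace(Root, 0U);
  while (!Stack.empty()) {
    auto [Node, Level] = Stack.front();
    Stack.pop();
    VisitOrder.push_back(Node); // "try to reduce / seed" happens here
    if (++Level >= MaxDepth)
      continue; // depth budget spent: do not enqueue operands
    for (unsigned Op : Operands[Node])
      if (!Visited[Op]) {
        Visited[Op] = true;
        Stack.emplace(Op, Level);
      }
  }
}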
bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}

bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in vector register.
  return tryToVectorizeList(BuildVectorOpds, R);
}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB,
                                                   BoUpSLP &R) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
      (llvm::all_of(BuildVectorOpds,
                    IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask)))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R);
}
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(T *, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
      ++SameTypeIt;

    // Try to vectorize them.
    unsigned NumElts = (SameTypeIt - IncIt);
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    if (NumElts > 1 &&
        TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
      // Success: start over because instructions might have been changed.
      Changed = true;
    } else {
      /// \Returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        Candidates.append(IncIt, std::next(IncIt, NumElts));
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E ||
         (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success: start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        for (auto *It = Candidates.begin(), *End = Candidates.end();
             It != End;) {
          auto *SameTypeIt = It;
          while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
            ++SameTypeIt;
          unsigned NumElts = (SameTypeIt - It);
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
                                                  /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
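
// After sorting, compatible elements are adjacent, so the helper above is
// really a "group runs of equal keys, try each run" loop. The same shape on
// a sorted std::vector (hypothetical predicates, standard C++ only):
#include <vector>

template <typename T, typename SameGroup, typename TryGroup>
static bool forEachRun(std::vector<T> &Sorted, SameGroup Same, TryGroup Try) {
  bool Changed = false;
  for (size_t Begin = 0; Begin != Sorted.size();) {
    size_t End = Begin + 1;
    while (End != Sorted.size() && Same(Sorted[Begin], Sorted[End]))
      ++End; // extend the run of mutually compatible elements
    if (End - Begin > 1)
      Changed |= Try(&Sorted[Begin], End - Begin); // attempt one bundle
    Begin = End; // restart at the next incompatible element
  }
  return Changed;
}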
/// Compare two cmp instructions. If IsCompatibility is true, function returns
/// true if 2 cmps are compatible (have equal predicates and successors of the
/// operands are compatible); otherwise it serves as a strict weak ordering
/// for sorting.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
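
// Both instantiations of this template reduce to a lexicographic key
// comparison: operand type ID, then canonical (swap-invariant) predicate,
// then per-operand tie-breakers. The comparator shape on a plain key struct
// (hypothetical key, standard C++ only):
#include <algorithm>
#include <tuple>

struct CmpKey {
  unsigned TypeID;
  unsigned Pred;      // predicate as written
  unsigned SwapPred;  // predicate with operands swapped
  unsigned OperandID; // first differing operand's value ID
};

static bool cmpKeyLess(const CmpKey &A, const CmpKey &B) {
  // min(Pred, SwapPred) makes "x < y" and "y > x" sort together, matching
  // the BasePred1/BasePred2 canonicalization above.
  return std::make_tuple(A.TypeID, std::min(A.Pred, A.SwapPred),
                         A.OperandID) <
         std::make_tuple(B.TypeID, std::min(B.Pred, B.SwapPred), B.OperandID);
}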
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op))
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize list of compares.
  // Sort by type, compare predicate, etc.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, *TLI, *DT);
  };

  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    return compareCmp<true>(V1, V2, *TLI, *DT);
  };

  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(V->getType()))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  // pass1 - try to vectorize reductions first
  for (auto *I : reverse(Instructions)) {
    if (R.isDeleted(I))
      continue;
    OpsChanged |=
        vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
  }
  // pass2 - try to match and vectorize a buildvector sequence.
  for (auto *I : reverse(Instructions)) {
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
    }
  }
  // Now try to vectorize postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node. Used to detect the phi nodes with the same incoming values for
  // sorting.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers; we don't care about other types.
    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
      return true;
    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S.getOpcode() && !S.isAltShuffle())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
        if (C1 && C2)
          continue;
        if (C1)
          return true;
        if (C2)
          return false;
      }
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      {
        // Non-constant non-instructions come next.
        if (!U1 && !U2) {
          auto ValID1 = Opcodes1[I]->getValueID();
          auto ValID2 = Opcodes2[I]->getValueID();
          if (ValID1 == ValID2)
            continue;
          if (ValID1 < ValID2)
            return true;
          if (ValID1 > ValID2)
            return false;
        }
        if (!U1)
          return true;
        if (!U2)
          return false;
      }
      // Undefs come last.
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (I1->getParent() != I2->getParent())
            return false;
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S.getOpcode())
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P)
        break;

      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    // Find the corresponding non-phi nodes for better matching when trying to
    // build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();
  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in PostProcessInserts and, if VectorizeCmps is true,
  // also vectorizes PostProcessCmps.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if I is in PostProcessInserts or PostProcessCmps.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if the instruction has no users, and its result is used only
  // implicitly (void-typed or call/invoke-like).
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable type. The num of elements is unknown at
    // compile-time for scalable type.
    if (isa<ScalableVectorType>(It->getType()))
      continue;

    // Skip instructions marked for the deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid value.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    if (isa<DbgInfoIntrinsic>(It))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI))
          Changed |= vectorizeRootInstruction(nullptr, PI,
                                              P->getIncomingBlock(I), R, TTI);
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize chain in store, if this is the only store to the
        // address in the block.
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay
          // their vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
        }
      }
      // Start vectorization of post-process list of instructions from the
      // top-tree instructions to try to vectorize as many instructions as
      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid value.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. If a vector register can't hold 1 element, we are done. We
    // are trying to vectorize the index computations, so the maximum number
    // of elements is based on the size of the index expression, rather than
    // the size of the GEP itself (the target's pointer size).
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them, or their index is optimized to a constant
      // value. If so, they are marked as deleted, so remove them from the set
      // of candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1;
           ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        auto *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          auto *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      // We ensured the indices met these constraints when we originally
      // collected the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
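
// The SCEV-based pruning above removes pairs whose address difference folds
// to a compile-time constant, since one index is then derivable from the
// other. The same filter with optional integers standing in for "constant
// vs. runtime" SCEVs (hypothetical stand-in, standard C++ only):
#include <optional>
#include <set>
#include <vector>

static void pruneConstantOffsetPairs(
    const std::vector<std::optional<long>> &Addrs,
    std::set<size_t> &Candidates) {
  for (size_t I = 0; I < Addrs.size() && Candidates.size() > 1; ++I) {
    if (!Candidates.count(I))
      continue;
    for (size_t J = I + 1; J < Addrs.size() && Candidates.size() > 1; ++J) {
      // Both known: the difference is a constant, so drop the pair, just as
      // the SCEVConstant check drops GEPI/GEPJ above.
      if (Addrs[I] && Addrs[J]) {
        Candidates.erase(I);
        Candidates.erase(J);
      }
    }
  }
}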
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and values operand. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    // UndefValues are compatible with all other values.
    if (isa<UndefValue>(V->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        if (S.getOpcode())
          return false;
        return I1->getOpcode() < I2->getOpcode();
      }
    if (isa<Constant>(V->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return false;
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        return S.getOpcode() > 0;
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse stores to do bottom-to-top analysis. This is important if the
    // values are stored to the same addresses several times, in this case
    // need to follow the stores order (reversed to meet the memory
    // dependencies).
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
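
// The store sorter above is again a lexicographic key: value-operand type
// ID, then pointer-operand type ID, then value-operand identity. Grouping by
// such a key before bundling keeps only plausibly-vectorizable stores
// adjacent. A compact model of that key (hypothetical fields, standard C++
// only):
#include <tuple>

struct StoreKey {
  unsigned ValueTypeID;   // type of the stored value
  unsigned PointerTypeID; // type of the pointer operand
  unsigned ValueID;       // kind of the stored value (inst/const/...)
};

static bool storeKeyLess(const StoreKey &A, const StoreKey &B) {
  return std::tie(A.ValueTypeID, A.PointerTypeID, A.ValueID) <
         std::tie(B.ValueTypeID, B.PointerTypeID, B.ValueID);
}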
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Loop::LoopBounds::Direction Direction
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static Value * isOneOf(const InstructionsState &S, Value *Op)
Chooses the correct key for scheduling data.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static std::string shortBundleName(ArrayRef< Value * > VL)
Print a short descriptor of the instruction bundle suitable for debug output.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static std::optional< unsigned > getInsertIndex(const Value *InsertInst, unsigned Offset=0)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
This defines the Use class.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
value_type & FindAndConstruct(const KeyT &Key)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
const BasicBlock * getParent() const
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
size_type count(const KeyT &Key) const
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
void reserve(size_type NumEntries)
Grow the MapVector so that it can contain at least NumEntries items before resizing again.
std::pair< KeyT, ValueT > & front()
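MapVector's deterministic iteration is the point of the "deterministic order" description above: iteration follows insertion order regardless of key hashing. A small sketch with assumed int keys and values:

```cpp
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"

using namespace llvm;

// Iteration order follows insertion order, which keeps downstream
// decisions and debug output stable across runs.
static SmallVector<int, 4> valuesInInsertionOrder() {
  MapVector<int, int> MV;
  MV.try_emplace(7, 70); // inserted first
  MV.insert({3, 30});    // inserted second
  SmallVector<int, 4> Out;
  for (const auto &KV : MV)
    Out.push_back(KV.second);
  return Out; // {70, 30}, not sorted by key
}
```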
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
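A minimal sketch tying CreatePHI and addIncoming together (the helper name and block layout are assumptions):

```cpp
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Illustrative only: create a phi at the top of Join merging A and B.
static PHINode *mergeAtJoin(BasicBlock *Join, Value *A, BasicBlock *FromA,
                            Value *B, BasicBlock *FromB) {
  IRBuilder<> Builder(Join, Join->begin()); // phis must lead the block
  PHINode *Phi = Builder.CreatePHI(A->getType(), /*NumReservedValues=*/2);
  Phi->addIncoming(A, FromA);
  Phi->addIncoming(B, FromB);
  return Phi;
}
```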
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
A discriminated union of two or more pointer types, with the discriminator in the low bit of the pointer.
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T get() const
Returns the value of the specified pointer type.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience functions.
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
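These entry points are the raw material for pointer-distance queries such as getPointersDiff further down. A minimal sketch, with the helper name ours and all unit/stride handling omitted:

```cpp
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include <optional>

using namespace llvm;

// If PtrB - PtrA folds to a constant SCEV, return it (in address units).
static std::optional<int64_t> constantPtrDiff(ScalarEvolution &SE,
                                              Value *PtrA, Value *PtrB) {
  const SCEV *Diff = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  if (const auto *C = dyn_cast<SCEVConstant>(Diff))
    return C->getAPInt().getSExtValue();
  return std::nullopt;
}
```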
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
void clear()
Completely clear the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exactly one source vector.
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e. each index between [0, VF) is used exactly once in each submask of size VF.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
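These static predicates let a cost model tell cheap shuffles from expensive ones without building an instruction. A sketch using exactly the signatures above; the classifier and its labels are ours:

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Illustrative classification, cheapest patterns checked first.
static const char *classifyMask(ArrayRef<int> Mask, int NumSrcElts) {
  if (ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
    return "identity";
  if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
    return "reverse";
  if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
    return "splat of element 0";
  int Index;
  if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index))
    return "extract subvector";
  return "generic";
}
```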
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
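find_first/find_next support the usual set-bit walk; a tiny sketch (the helper and what it counts are ours):

```cpp
#include "llvm/ADT/SmallBitVector.h"

using namespace llvm;

// Count unset positions between consecutive set bits; both find_first and
// find_next return -1 once no (further) bit is set.
static int countInteriorGaps(const SmallBitVector &Lanes) {
  int Gaps = 0, Prev = -1;
  for (int I = Lanes.find_first(); I != -1; I = Lanes.find_next(I)) {
    if (Prev != -1)
      Gaps += I - Prev - 1;
    Prev = I;
  }
  return Gaps;
}
```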
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all small size specializations.
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
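The insert().second idiom above is the standard one-pass deduplication; sketch with an assumed helper name:

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// insert() returns {iterator, inserted}; the bool is true only the first
// time a given pointer is seen.
static unsigned countUniqueValues(ArrayRef<Value *> VL) {
  SmallPtrSet<Value *, 16> Seen;
  unsigned Unique = 0;
  for (Value *V : VL)
    if (Seen.insert(V).second)
      ++Unique;
  return Unique;
}
```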
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+, etc.).
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Value * getOperand(unsigned i) const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
unsigned getTreeSize() const
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars to be extracted, their lanes and their scalar users.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and extraction in the UserIgnoreLst.
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score, deemed to have the best chance to form the root of a profitable tree to vectorize.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibly) a permutation with other gathers.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
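Taken together, the BoUpSLP interface is driven in a fixed sequence. A condensed sketch of that flow, loosely following tryToVectorizeList in this file; the wrapper name and the exact threshold check are ours, and remarks/bookkeeping are omitted:

```cpp
// Sketch only: BoUpSLP is the file-internal class documented above, so a
// helper like this would live inside SLPVectorizer.cpp itself.
static bool trySLP(BoUpSLP &R, ArrayRef<Value *> VL, int Threshold) {
  SmallDenseSet<Value *> Ignore;  // users to ignore while scheduling
  R.buildTree(VL, Ignore);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;                 // shuffle overhead would dominate
  R.reorderTopToBottom();         // pick profitable operand orders
  R.reorderBottomToTop();
  R.buildExternalUses();          // scalars still used outside the tree
  R.computeMinimumValueSizes();   // try to narrow element types
  InstructionCost Cost = R.getTreeCost();
  if (!Cost.isValid() || !(Cost < -Threshold))
    return false;                 // must beat scalar code by the margin
  R.vectorizeTree();              // emit vector IR and extracts
  return true;
}
```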
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
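These combinators nest, so a whole expression tree can be matched and its pieces captured in one call. A small sketch; the predicate name and the matched shape are ours:

```cpp
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Match V against (X + C) or (X * C) with C a constant integer (possibly
// a splat vector), capturing the variable operand and the constant.
static bool matchAddOrMulByConst(Value *V, Value *&X, const APInt *&C) {
  return match(V, m_CombineOr(m_Add(m_Value(X), m_APInt(C)),
                              m_Mul(m_Value(X), m_APInt(C))));
}
```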
@ Undef
Value of the register doesn't matter.
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible, and it is possible to calculate the distance between them.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item in the sequence, and B, C, ... are the values from the original input ranges.
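These range wrappers (all_of, for_each, enumerate, and friends) replace explicit begin/end pairs throughout the pass; combined with enumerate, index-dependent checks stay one-liners. Sketch (helper name and stride check are ours):

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"

using namespace llvm;

// True iff Offsets is exactly {0, Stride, 2*Stride, ...}; enumerate()
// pairs each element with its 0-based index.
static bool isStridedBy(ArrayRef<int> Offsets, int Stride) {
  return all_of(enumerate(Offsets), [Stride](const auto &P) {
    return P.value() == int(P.index()) * Stride;
  });
}
```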
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx.
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
testing::Matcher< const detail::ErrorHolder & > Failed()
bool getAlign(const Function &F, unsigned index, unsigned &align)
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal.address from the specified value, returning the original object being addressed.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
iterator_range< po_iterator< T > > post_order(const T &G)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
auto reverse(ContainerTy &&C)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices, if reordering is required.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that is a part of the vectorized tree.
constexpr int PoisonMaskElem
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
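A sketch of how such a query can feed narrowing decisions of the computeMinimumValueSizes flavor; the helper name and the 16-bit target width are assumptions:

```cpp
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/KnownBits.h"

using namespace llvm;

// True if every bit above the low 16 is provably zero, i.e. V (an integer
// value) could be represented as an i16 without losing information.
static bool fitsInLow16Bits(const Value *V, const DataLayout &DL) {
  unsigned BW = V->getType()->getIntegerBitWidth();
  if (BW <= 16)
    return true;
  KnownBits Known(BW);
  computeKnownBits(V, Known, DL);
  return Known.countMinLeadingZeros() >= BW - 16;
}
```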
DWARFExpression::Operation Op
auto max_element(R &&Range)
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successor blocks.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value and is Skew mod Align.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
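The integer helpers in this list (divideCeil, PowerOf2Ceil, bit_floor, Log2_32, alignDown, ...) compose into the usual factor-rounding arithmetic; a self-checking sketch with values chosen purely for illustration:

```cpp
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::divideCeil(10, 4) == 3); // ceil(10 / 4)
  assert(llvm::PowerOf2Ceil(10) == 16); // next power of two
  assert(llvm::bit_floor(10u) == 8u);   // previous power of two
  assert(llvm::Log2_32(16) == 4);       // floor log2
  assert(llvm::isPowerOf2_32(16));
  assert(llvm::alignDown(10, 4) == 8);  // round down to a multiple of 4
  return 0;
}
```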
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than C's type.
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits functionality.
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are taken from the machine instruction.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKind::Vector with VF lanes.
Function object to check whether the first component of a container supported by std::get (like std::pair or std::tuple) compares less than the first component of another container.
Function object to check whether the second component of a container supported by std::get (like std::pair or std::tuple) compares less than the second component of another container.
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const