#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif

using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<bool> RunSLPVectorization(
    "vectorize-slp", cl::init(true), cl::Hidden,
    cl::desc("Run the SLP vectorization passes"));

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
    "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

static cl::opt<bool>
    ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                       cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<bool> AllowHorRdxIdenityOptimization(
    "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
    cl::desc("Allow optimization of original scalar identity operations on "
             "matched horizontal reductions."));

static cl::opt<int>
    MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
                           cl::desc("Attempt to vectorize for this register "
                                    "size in bits"));

static cl::opt<unsigned>
    MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
                cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

static cl::opt<int>
    ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000),
                             cl::Hidden,
                             cl::desc("Limit the size of the SLP scheduling "
                                      "region per block"));

static cl::opt<int>
    MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden,
                           cl::desc("Attempt to vectorize for this register "
                                    "size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<unsigned> MinProfitableStridedLoads(
    "slp-min-strided-loads", cl::init(2), cl::Hidden,
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

static cl::opt<unsigned> MaxProfitableLoadStride(
    "slp-max-stride", cl::init(8), cl::Hidden,
    cl::desc("The maximum stride, considered to be profitable."));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}
/// Checks if \p V is one of vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}
#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug
/// output.
static std::string shortBundleName(ArrayRef<Value *> VL) {
  std::string Result;
  raw_string_ostream OS(Result);
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  OS.flush();
  return Result;
}
#endif
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;
  if (all_of(VL, isVectorLikeInstWithConstOps))
    return true;

  BasicBlock *BB = I0->getParent();
  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);
    if (!II)
      return false;
    if (BB != II->getParent())
      return false;
  }
  return true;
}
/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}
/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(
                BO->uses(),
                [](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  ICmpInst::Predicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(BO->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}
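// Worked example for the Sub special case above (illustrative): in
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0
// the only use of %d is an eq-compare against zero, so the lanes may treat
// the sub as commutative (%a - %b == 0 iff %b - %a == 0). The same reasoning
// applies to @llvm.abs(sub ...) and @llvm.fabs(fsub ...) users.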
/// \returns inserting index of InsertElement or InsertValue instruction,
/// using Offset as base offset for index.
static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
                                              unsigned Offset = 0) {
  int Index = Offset;
  if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }

  const auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
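// Worked example for the insertvalue path above (illustrative): with
//   %s = insertvalue { [2 x float], float } %agg, float %x, 0, 1
// the loop first scales Index by the 2 struct fields and adds field index 0,
// then scales by the 2 array elements and adds element index 1, yielding the
// linearized insert position 1.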
/// Creates a use mask for the given vector operand (first or second) out of
/// the insertelement shuffle mask.
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}
/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getInsertIndex(II);
        if (!Idx) {
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}
/// Checks if the vector of instructions can be represented as a shuffle, like:
/// a permutation of a single vector or a blend (select) of two vectors.
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  auto *EI0 = cast<ExtractElementInst>(*It);
  if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
    return std::nullopt;
  unsigned Size =
      cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector(Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
      return std::nullopt;
    if (isa<UndefValue>(EI->getIndexOperand()))
      continue;
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    if (!Idx)
      return std::nullopt;
    // Undefined behavior if Idx is negative or >= Size.
    if (Idx->getValue().uge(Size))
      continue;
    unsigned IntIdx = Idx->getValue().getZExtValue();
    Mask[I] = IntIdx;
    // For correct shuffling we have to have at most two different vector
    // operands.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (IntIdx != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never filled, we have a permutation of a single vector,
  // otherwise we have permutation of 2 vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}
/// \returns the index of an extractelement or extractvalue instruction.
static std::optional<unsigned> getExtractIndex(Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}
namespace {
/// Main data required for vectorization of instructions.
struct InstructionsState {
  /// The very first instruction in the list with the main opcode.
  Value *OpValue = nullptr;

  /// The main/alternate instruction.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return MainOp ? MainOp->getOpcode() : 0; }

  unsigned getAltOpcode() const { return AltOp ? AltOp->getOpcode() : 0; }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return AltOp != MainOp; }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  InstructionsState() = delete;
  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};
} // end anonymous namespace
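// Example (illustrative, not from the source): for the bundle
//   %a0 = add i32 %x0, %y0
//   %s1 = sub i32 %x1, %y1
// MainOp is the add and AltOp is the sub, so isAltShuffle() is true and both
// opcodes are accepted by isOpcodeOrAlt(); codegen can then emit a vector
// add, a vector sub, and a shufflevector blending the even/odd lanes.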
/// Chooses the correct key for scheduling data. If \p Op has the same (or
/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
/// OpValue.
static Value *isOneOf(const InstructionsState &S, Value *Op) {
  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))
    return Op;
  return S.OpValue;
}

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex = 0);
/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (isConstant(BaseOp0) && isConstant(Op0)) ||
         (isConstant(BaseOp1) && isConstant(Op1)) ||
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
         getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
}

/// \returns true if a compare instruction \p CI has similar "look" and same
/// predicate as \p BaseCI, "as is" or with its operands and predicate swapped,
/// false otherwise.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseCI->getOperand(0), BaseCI->getOperand(1),
                              CI->getOperand(0), CI->getOperand(1), TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseCI->getOperand(0), BaseCI->getOperand(1),
                              CI->getOperand(1), CI->getOperand(0), TLI));
}
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex) {
  // Make sure these are all Instructions.
  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
  CmpInst::Predicate BasePred =
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
              : CmpInst::BAD_ICMP_PREDICATE;
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  bool SwappedPredsCompatible = [&]() {
    if (!IsCmpOp)
      return false;
    SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      if (!I)
        return false;
      CmpInst::Predicate CurrentPred = I->getPredicate();
      CmpInst::Predicate SwappedCurrentPred =
          CmpInst::getSwappedPredicate(CurrentPred);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    // Total number of predicates > 2, but if consider swapped predicates
    // compatible only 2, consider swappable predicates as compatible opcodes,
    // not alternate.
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();

  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  auto *IBase = cast<Instruction>(VL[BaseIndex]);
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = cast<Instruction>(VL[Cnt]);
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltIndex = Cnt;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltIndex = Cnt;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        // Check for compatible operands. If the corresponding operands are not
        // compatible - need to perform alternate vectorization.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if ((E == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex != BaseIndex) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltIndex = Cnt;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
                        CallBase->op_begin() +
                            CallBase->getBundleOperandsStartIndex()))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        }
      }
      continue;
    }
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }

  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
}
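// Illustration of what getSameOpcode() accepts (hedged examples, not from the
// source): {fadd, fadd} yields a single-opcode state; {add, sub, add, sub}
// yields MainOp add / AltOp sub (an alternate shuffle); {icmp slt, icmp sgt}
// on the same types is compatible because the second predicate is the swapped
// form of the first; {add, mul, sub} returns a null-MainOp state, i.e. the
// bundle cannot be represented as one vectorizable node.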
/// Checks if the \p UserInst uses \p Scalar as a pointer or other operand
/// that must remain scalar after vectorization.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI) {
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    return any_of(enumerate(CI->args()), [&](auto &&Arg) {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
             Arg.value().get() == Scalar;
    });
  }
  default:
    return false;
  }
}
/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}
/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // Check if input scalars were extended to match the size of other node.
       (SubMask.size() == Mask.size() &&
        std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
                    [](int Idx) { return Idx == PoisonMaskElem; }))) &&
      "SubMask with many inputs support must be larger than the mask.");
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}
/// Order may have elements assigned special value (size) which is out of
/// bounds. Such indices only appear on places which correspond to undef values
/// (see canReuseExtract for details) and used in order to avoid undef values
/// have effect on operands ordering.
/// The first loop below simply finds all unused indices and then the next loop
/// nest assigns these indices for undef values positions.
/// As an example below Order has two undef positions and they have assigned
/// values 3 and 7 respectively:
/// before:  6 9 5 4 9 2 1 0
/// after:   6 3 5 4 7 2 1 0
static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
  const unsigned Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
}

/// \returns a shuffle mask for the order \p Indices.
static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E, PoisonMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}
/// Reorders the list of scalars in accordance with the given \p Mask.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
                           ArrayRef<int> Mask) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  SmallVector<Value *> Prev(Scalars.size(),
                            PoisonValue::get(Scalars.front()->getType()));
  Prev.swap(Scalars);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Scalars[Mask[I]] = Prev[I];
}
/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all operands are either not instructions
/// or phi nodes or instructions from different blocks.
static bool areAllOperandsNonInsts(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  return !mayHaveNonDefUseDependency(*I) &&
         all_of(I->operands(), [I](Value *V) {
           auto *IO = dyn_cast<Instruction>(V);
           if (!IO)
             return true;
           return isa<PHINode>(IO) || IO->getParent() != I->getParent();
         });
}

/// Checks if the provided value does not require scheduling, i.e. this is not
/// an instruction or it does not read/write memory and all users are phi nodes
/// or instructions from different blocks.
static bool isUsedOutsideBlock(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  // Limits the number of uses to save compile time.
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         all_of(I->users(), [I](User *U) {
           auto *IU = dyn_cast<Instruction>(U);
           if (!IU)
             return true;
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
         });
}

/// Checks if the specified array of instructions does not require scheduling.
/// It is so if all either operands or users do not need to be scheduled in the
/// current basic block.
static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
  return !VL.empty() &&
         (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
}
namespace slpvectorizer {

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
  struct TreeEntry;
  struct ScheduleData;

public:
  using ValueList = SmallVector<Value *, 8>;
  using OrdersType = SmallVector<unsigned, 4>;

  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
        Builder(Se->getContext(), TargetFolder(*DL)) {
    // Vector register sizes come from TTI unless overridden on the command
    // line via -slp-max-reg-size/-slp-min-reg-size.
    if (MaxVectorRegSizeOption.getNumOccurrences())
      MaxVecRegSize = MaxVectorRegSizeOption;
    else
      MaxVecRegSize =
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedValue();
    if (MinVectorRegSizeOption.getNumOccurrences())
      MinVecRegSize = MinVectorRegSizeOption;
    else
      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  }
  /// Returns whether the root node has in-tree uses.
  bool doesRootHaveInTreeUses() const {
    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();
  }

  /// Return the scalars of the root node.
  ArrayRef<Value *> getRootNodeScalars() const {
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
  }
  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MultiNodeScalars.clear();
    MustGather.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    ExternalUses.clear();
    ExternalUsesAsGEPs.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
  }
  unsigned getMaxVecRegSize() const { return MaxVecRegSize; }

  unsigned getMinVecRegSize() const { return MinVecRegSize; }

  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
    unsigned MaxVF = MaxVFOption.getNumOccurrences()
                         ? MaxVFOption
                         : TTI->getMaximumVF(ElemWidth, Opcode);
    // MaxVF of 0 means unlimited.
    return MaxVF ? MaxVF : UINT_MAX;
  }

  /// Checks if the given array of loads can be represented as a vectorized,
  /// scatter or just simple gather.
  LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                               SmallVectorImpl<unsigned> &Order,
                               SmallVectorImpl<Value *> &PointerOps,
                               bool TryRecursiveCheck = true) const;
  /// This structure holds any data we need about the edges being traversed
  /// during buildTree_rec(): the user TreeEntry and the index of the operand
  /// edge.
  struct EdgeInfo {
    EdgeInfo() = default;
    EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
        : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
    /// The user TreeEntry.
    TreeEntry *UserTE = nullptr;
    /// The operand index of the use.
    unsigned EdgeIdx = UINT_MAX;

    /// Debug print.
    void dump(raw_ostream &OS) const {
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
    }
  };
  /// A helper class used for scoring candidates for two consecutive lanes.
  class LookAheadHeuristics {
    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    int NumLanes; // Total number of lanes (aka vectorization factor).
    int MaxLevel; // The maximum recursion depth for accumulating score.

  public:
    LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
                        ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
                        int MaxLevel)
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}

    // The scores below establish the order of preference when breaking ties
    // between candidate operand pairings.
    static const int ScoreConsecutiveLoads = 4;
    static const int ScoreSplatLoads = 3;
    static const int ScoreReversedLoads = 3;
    static const int ScoreMaskedGatherCandidate = 1;
    static const int ScoreConsecutiveExtracts = 4;
    static const int ScoreReversedExtracts = 3;
    static const int ScoreConstants = 2;
    static const int ScoreSameOpcode = 2;
    static const int ScoreAltOpcodes = 1;
    static const int ScoreSplat = 1;
    static const int ScoreUndef = 1;
    static const int ScoreFail = 0;
    static const int ScoreAllUserVectorized = 1;

    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
    int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
                        ArrayRef<Value *> MainAltOps) const {
      if (!isValidElementType(V1->getType()) ||
          !isValidElementType(V2->getType()))
        return LookAheadHeuristics::ScoreFail;

      if (V1 == V2) {
        if (isa<LoadInst>(V1)) {
          // Retruns true if the users of V1 and V2 won't need to be extracted.
          auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
            // Bail out if we have too many uses to save compilation time.
            if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
              return false;

            auto AllUsersVectorized = [U1, U2, this](Value *V) {
              return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
                return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
              });
            };
            return AllUsersVectorized(V1) && AllUsersVectorized(V2);
          };
          // A broadcast of a load can be cheaper on some targets.
          if (R.TTI->isLegalBroadcastLoad(V1->getType(),
                                          ElementCount::getFixed(NumLanes)) &&
              ((int)V1->getNumUses() == NumLanes ||
               AllUsersAreInternal(V1, V2)))
            return LookAheadHeuristics::ScoreSplatLoads;
        }
        return LookAheadHeuristics::ScoreSplat;
      }

      auto *LI1 = dyn_cast<LoadInst>(V1);
      auto *LI2 = dyn_cast<LoadInst>(V2);
      if (LI1 && LI2) {
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
            !LI2->isSimple())
          return LookAheadHeuristics::ScoreFail;

        std::optional<int> Dist = getPointersDiff(
            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
        if (!Dist || *Dist == 0) {
          if (getUnderlyingObject(LI1->getPointerOperand()) ==
                  getUnderlyingObject(LI2->getPointerOperand()) &&
              R.TTI->isLegalMaskedGather(
                  FixedVectorType::get(LI1->getType(), NumLanes),
                  LI1->getAlign()))
            return LookAheadHeuristics::ScoreMaskedGatherCandidate;
          return LookAheadHeuristics::ScoreFail;
        }
        // The distance is too large - still may be profitable to use masked
        // loads/gathers.
        if (std::abs(*Dist) > NumLanes / 2)
          return LookAheadHeuristics::ScoreMaskedGatherCandidate;
        // This still will detect consecutive loads, but we might have "holes"
        // in some cases.
        return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
                           : LookAheadHeuristics::ScoreReversedLoads;
      }

      auto *C1 = dyn_cast<Constant>(V1);
      auto *C2 = dyn_cast<Constant>(V2);
      if (C1 && C2)
        return LookAheadHeuristics::ScoreConstants;

      // Extracts from consecutive indexes of the same vector better score as
      // the extracts could be optimized away.
      Value *EV1;
      ConstantInt *Ex1Idx;
      if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
        // Undefs are always profitable for extractelements.
        if (isa<UndefValue>(V2))
          return LookAheadHeuristics::ScoreConsecutiveExtracts;
        Value *EV2 = nullptr;
        ConstantInt *Ex2Idx = nullptr;
        if (match(V2, m_ExtractElt(m_Value(EV2),
                                   m_CombineOr(m_ConstantInt(Ex2Idx),
                                               m_Undef())))) {
          if (!Ex2Idx)
            return LookAheadHeuristics::ScoreConsecutiveExtracts;
          if (EV2 == EV1) {
            int Idx1 = Ex1Idx->getZExtValue();
            int Idx2 = Ex2Idx->getZExtValue();
            int Dist = Idx2 - Idx1;
            // The distance is too large - still may be profitable to use
            // shuffles.
            if (std::abs(Dist) == 0)
              return LookAheadHeuristics::ScoreSplat;
            if (std::abs(Dist) > NumLanes / 2)
              return LookAheadHeuristics::ScoreSameOpcode;
            return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
                              : LookAheadHeuristics::ScoreReversedExtracts;
          }
          return LookAheadHeuristics::ScoreAltOpcodes;
        }
        return LookAheadHeuristics::ScoreFail;
      }

      auto *I1 = dyn_cast<Instruction>(V1);
      auto *I2 = dyn_cast<Instruction>(V2);
      if (I1 && I2) {
        if (I1->getParent() != I2->getParent())
          return LookAheadHeuristics::ScoreFail;
        SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
        Ops.push_back(I1);
        Ops.push_back(I2);
        InstructionsState S = getSameOpcode(Ops, TLI);
        // Note: Only consider instructions with <= 2 operands to avoid
        // complexity explosion.
        if (S.getOpcode() &&
            (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
            all_of(Ops, [&S](Value *V) {
              return cast<Instruction>(V)->getNumOperands() ==
                     S.MainOp->getNumOperands();
            }))
          return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
                                  : LookAheadHeuristics::ScoreSameOpcode;
      }

      if (isa<UndefValue>(V2))
        return LookAheadHeuristics::ScoreUndef;

      return LookAheadHeuristics::ScoreFail;
    }
    /// Go through the operands of \p LHS and \p RHS recursively until
    /// MaxLevel, and return the cummulative score.
    int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
                           Instruction *U2, int CurrLevel,
                           ArrayRef<Value *> MainAltOps) const {
      // Get the shallow score of V1 and V2.
      int ShallowScoreAtThisLevel =
          getShallowScore(LHS, RHS, U1, U2, MainAltOps);

      // If reached MaxLevel, or if V1 and V2 are not instructions, or if they
      // are identical, or if it is not profitable to recurse further, early
      // return the current score.
      auto *I1 = dyn_cast<Instruction>(LHS);
      auto *I2 = dyn_cast<Instruction>(RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      // Contains the I2 operand indexes that got matched with I1 operands.
      SmallSet<unsigned, 4> Op2Used;

      // Recursion towards the operands of I1 and I2. We are trying all possible
      // operand pairs, and keeping track of the best score.
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        // Try to pair op1I with the best operand of I2.
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
        // If I2 is commutative try all combinations.
        unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
        unsigned ToIdx = isCommutative(I2)
                             ? I2->getNumOperands()
                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          // Skip operands already paired with OpIdx1.
          if (Op2Used.count(OpIdx2))
            continue;
          // Recursively calculate the cost at each level.
          int TmpScore =
              getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
                                 I1, I2, CurrLevel + 1, std::nullopt);
          // Look for the best score.
          if (TmpScore > LookAheadHeuristics::ScoreFail &&
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
            MaxOpIdx2 = OpIdx2;
            FoundBest = true;
          }
        }
        if (FoundBest) {
          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
        }
      }
      return ShallowScoreAtThisLevel;
    }
  };
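  // Look-ahead scoring sketch (illustrative): for the two lanes
  //   %a = add i32 %l0, %x      ; %l0 = load p[0]
  //   %b = add i32 %l1, %y      ; %l1 = load p[1]
  // level 1 scores {%a, %b} as same-opcode adds, and the level-2 recursion
  // pairs {%l0, %l1} as consecutive loads, boosting the total score; pairing
  // %l0 with %y instead would score 0, so the loads stay paired together.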
  /// A helper data structure to hold the operands of a vector of instructions.
  /// This supports a fixed vector length for all operand vectors.
  class VLOperands {
    /// For each operand we need (i) the value, and (ii) the opcode that it
    /// would be attached to if the expression was in a left-linearized form
    /// (the "APO" - all paths opcode).
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      /// The operand value.
      Value *V = nullptr;
      /// TreeEntries only allow a single opcode, or an alternate sequence of
      /// them (e.g, +, -), so a boolean suffices for the APO of all operands.
      bool APO = false;
      /// Helper data for the reordering function.
      bool IsUsed = false;
    };
    /// During operand reordering we try to select, at each lane, the operand
    /// that matches best with the operand at the neighboring lane. The
    /// strategies are summarized in the 'ReorderingMode' enumerator.
    enum class ReorderingMode {
      Load,     ///< Matching loads to consecutive memory addresses
      Opcode,   ///< Matching instructions based on opcode (same or alternate)
      Constant, ///< Matching constants
      Splat,    ///< Matching the same instruction multiple times (broadcast)
      Failed,   ///< We failed to create a vectorizable group
    };

    using OperandDataVec = SmallVector<OperandData, 2>;

    /// A vector of operand vectors indexed as OpsVec[OpIdx][Lane].
    SmallVector<OperandDataVec, 4> OpsVec;

    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;

    /// The best scores for the operands found so far, per (OpIdx, Lane) pair.
    SmallDenseMap<std::pair<unsigned, unsigned>, unsigned> BestScoresPerLanes;
    /// \returns the operand data at \p OpIdx and \p Lane.
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }

    /// Clears the used flag for all entries.
    void clearUsed() {
      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
           OpIdx != NumOperands; ++OpIdx)
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
             ++Lane)
          OpsVec[OpIdx][Lane].IsUsed = false;
    }

    /// Swaps the operand at \p OpIdx1 with the operand at \p OpIdx2.
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    }
    /// \returns a bonus score if swapping in the operand at \p Idx would make
    /// the column at \p OpIdx closer to a power-of-2 number of unique values.
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
        return 0;
      SmallPtrSet<Value *, 4> Uniques;
      for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
        if (Ln == Lane)
          continue;
        Value *OpIdxLnV = getData(OpIdx, Ln).V;
        if (!isa<Instruction>(OpIdxLnV))
          return 0;
        Uniques.insert(OpIdxLnV);
      }
      int UniquesCount = Uniques.size();
      int UniquesCntWithIdxLaneV =
          Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      int UniquesCntWithOpIdxLaneV =
          Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        return 0;
      return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
              UniquesCntWithOpIdxLaneV) -
             (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    }
    /// \returns a bonus if all external users of the candidate operand are
    /// already vectorized. Lanes with external uses get a lower score.
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      // Do not care about the number of uses of vector-like instructions
      // (extractelement/extractvalue with constant indices): they are extracts
      // themselves and already externally used.
      if (isVectorLikeInstWithConstOps(IdxLaneV) &&
          isVectorLikeInstWithConstOps(OpIdxLaneV))
        return LookAheadHeuristics::ScoreAllUserVectorized;
      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
        return 0;
      return R.areAllUsersVectorized(IdxLaneI)
                 ? LookAheadHeuristics::ScoreAllUserVectorized
                 : 0;
    }

    /// Score scaling factor for fully compatible instructions but with
    /// different number of external uses.
    static const int ScoreScaleFactor = 10;
    /// \returns the look-ahead score, which tells us how much the sub-trees
    /// rooted at \p LHS and \p RHS match.
    int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
                          int Lane, unsigned OpIdx, unsigned Idx,
                          bool &IsUsed) {
      LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
                                    LookAheadMaxDepth);
      int Score = LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr,
                                               /*U2=*/nullptr,
                                               /*CurrLevel=*/1, MainAltOps);
      if (Score) {
        int SplatScore = getSplatScore(Lane, OpIdx, Idx);
        if (Score <= -SplatScore) {
          // Failed score.
          Score = 0;
        } else {
          Score += SplatScore;
          // Scale score to distinguish between different operands and similar
          // operands with all vs. not-all vectorized uses.
          Score *= ScoreScaleFactor;
          Score += getExternalUseScore(Lane, OpIdx, Idx);
        }
      }
      return Score;
    }
    /// Search all operands in Ops[\p OpIdx][\p Lane] for the one that matches
    /// best with the operand of the previous lane.
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                   ArrayRef<ReorderingMode> ReorderingModes,
                   ArrayRef<Value *> MainAltOps) {
      unsigned NumOperands = getNumOperands();

      // The operand of the previous lane at OpIdx.
      Value *OpLastLane = getData(OpIdx, LastLane).V;

      // Our strategy mode for OpIdx.
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;

      // The linearized opcode of the operand at OpIdx, Lane.
      bool OpIdxAPO = getData(OpIdx, Lane).APO;

      // The best operand index and its score.
      struct BestOpData {
        std::optional<unsigned> Idx;
        unsigned Score = 0;
      } BestOp;
      BestOp.Score =
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
              .first->second;

      // Track if the operand must be marked as used.
      bool IsUsed =
          RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;

      // Iterate through all unused operands and look for the best.
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        // Get the operand at Idx and Lane.
        OperandData &OpData = getData(Idx, Lane);
        Value *Op = OpData.V;
        bool OpAPO = OpData.APO;

        // Skip already selected operands.
        if (OpData.IsUsed)
          continue;

        // Skip if we are trying to move the operand to a position with a
        // different opcode in the linearized tree form. This would break the
        // semantics.
        if (OpAPO != OpIdxAPO)
          continue;

        // Look for an operand that matches the current mode.
        switch (RMode) {
        case ReorderingMode::Load:
        case ReorderingMode::Constant:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed);
          if (Score > static_cast<int>(BestOp.Score)) {
            BestOp.Idx = Idx;
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
          }
          break;
        }
        case ReorderingMode::Splat:
          if (Op == OpLastLane)
            BestOp.Idx = Idx;
          break;
        case ReorderingMode::Failed:
          llvm_unreachable("Not expected Failed reordering mode.");
        }
      }

      if (BestOp.Idx) {
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
        return BestOp.Idx;
      }
      // If we could not find a good match return std::nullopt.
      return std::nullopt;
    }
    /// Helper for reorder(): find the lane to start the greedy reordering from,
    /// preferring lanes whose operands can move the least.
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      // std::pair<unsigned, unsigned> is used to implement a simple voting
      // algorithm and choose the lane with the largest number of operands
      // sharing the same value and opcode parent.
      DenseMap<unsigned, std::pair<unsigned, unsigned>> HashMap;
      // Try to be closer to the original results, if we have multiple lanes
      // with same cost. If 2 lanes have the same cost, use the one with the
      // highest index.
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        // Compare the number of operands that can move and choose the one with
        // the least number.
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap.clear();
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          // Select the most optimal lane in terms of number of operands that
          // should be moved around.
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto *It = HashMap.find(NumFreeOpsHash.Hash);
          if (It == HashMap.end())
            HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
          else
            ++It->second.first;
        }
      }
      // Select the lane with the minimum counter.
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
      for (const auto &Data : reverse(HashMap)) {
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
      }
      return BestLane;
    }
    /// Data structure that helps to reorder operands.
    struct OperandsOrderData {
      /// The best number of operands with the same APOs, which can be
      /// reordered.
      unsigned NumOfAPOs = UINT_MAX;
      /// Number of operands with the same/alternate instruction opcode and
      /// parent.
      unsigned NumOpsWithSameOpcodeParent = 0;
      /// Hash for the actual operands ordering.
      unsigned Hash = 0;
    };

    /// \returns the maximum number of operands that are allowed to be
    /// reordered for \p Lane and the number of compatible instructions (with
    /// the same parent/opcode).
    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      // Operands with the same APO can be reordered.
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
      Instruction *OpcodeI = nullptr;
      BasicBlock *Parent = nullptr;
      unsigned Hash = 0;
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        const OperandData &OpData = getData(OpIdx, Lane);
        if (OpData.APO)
          ++CntTrue;
        // Use Boyer-Moore majority voting for finding the majority opcode and
        // the number of times it occurs.
        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
          if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
              I->getParent() != Parent) {
            if (NumOpsWithSameOpcodeParent == 0) {
              NumOpsWithSameOpcodeParent = 1;
              OpcodeI = I;
              Parent = I->getParent();
            } else {
              --NumOpsWithSameOpcodeParent;
            }
          } else {
            ++NumOpsWithSameOpcodeParent;
          }
        }
        Hash = hash_combine(
            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
      }
      if (AllUndefs)
        return {};
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      Data.Hash = Hash;
      return Data;
    }
    /// Go through the instructions in VL and append their operands.
    void appendOperandsOfVL(ArrayRef<Value *> VL) {
      assert(!VL.empty() && "Bad VL");
      assert((empty() || VL.size() == getNumLanes()) &&
             "Expected same number of lanes");
      assert(isa<Instruction>(VL[0]) && "Expected instruction");
      unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
      constexpr unsigned IntrinsicNumOperands = 2;
      if (isa<IntrinsicInst>(VL[0]))
        NumOperands = IntrinsicNumOperands;
      OpsVec.resize(NumOperands);
      unsigned NumLanes = VL.size();
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        OpsVec[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
          // The LHS operand of both add and sub is never attached to an
          // inverse operation in the linearized form, therefore its APO is
          // false. The RHS is true only if VL[Lane] is an inverse operation.
          // Since operand reordering is performed on groups of commutative
          // operations or alternating sequences (e.g., +, -), we can safely
          // tell the inverse operations by checking commutativity.
          bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
                                 APO, false};
        }
      }
    }
    /// \returns the number of operands.
    unsigned getNumOperands() const { return OpsVec.size(); }

    /// \returns the number of lanes.
    unsigned getNumLanes() const { return OpsVec[0].size(); }

    /// \returns the operand value at \p OpIdx and \p Lane.
    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    }

    /// \returns true if the data structure is empty.
    bool empty() const { return OpsVec.empty(); }

    /// Clears the data.
    void clear() { OpsVec.clear(); }
    /// \returns true if the value \p Op can be broadcast across the lanes,
    /// i.e. every other lane has an unused operand equal to \p Op.
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      bool OpAPO = getData(OpIdx, Lane).APO;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        // This is set to true if we found a candidate for broadcast at Lane.
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
            continue;
          if (Data.V == Op) {
            FoundCandidate = true;
            Data.IsUsed = true;
            break;
          }
        }
        if (!FoundCandidate)
          return false;
      }
      return true;
    }
  public:
    /// Initialize with all the operands of the instruction vector \p RootVL.
    VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R) {
      // Append all the operands of RootVL.
      appendOperandsOfVL(RootVL);
    }

    /// \returns the vector of operand \p OpIdx.
    ValueList getVL(unsigned OpIdx) const {
      ValueList OpVL(OpsVec[OpIdx].size());
      assert(OpsVec[OpIdx].size() == getNumLanes() &&
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;
      return OpVL;
    }
    // Performs operand reordering for 2 or more operands.
    // The original operands are in OrigOps[OpIdx][Lane].
    // The reordered operands are returned in Ops[OpIdx][Lane].
    void reorder() {
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      // Each operand has its own mode. We are using this mode to help us
      // select the instructions for each lane, so that they match best with
      // the ones we have selected so far.
      SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

      // This is a greedy single-pass algorithm. We are going over each lane
      // once and deciding on the best order right away with no back-tracking.
      // To increase its effectiveness, we start with the lane that has
      // operands that can move the least.
      unsigned FirstLane = getBestLaneToStartReordering();

      // Initialize the modes.
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);
        // Keep track if we have instructions with all the same opcode on one
        // side.
        if (isa<LoadInst>(OpLane0))
          ReorderingModes[OpIdx] = ReorderingMode::Load;
        else if (isa<Instruction>(OpLane0)) {
          // Check if OpLane0 should be broadcast.
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(OpLane0))
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        else if (isa<Argument>(OpLane0))
          // Our best hope is a Splat. It may save some cost in some cases.
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        else
          // NOTE: This should be unreachable.
          ReorderingModes[OpIdx] = ReorderingMode::Failed;
      }

      // Check that we don't have the same operands everywhere. No need to
      // reorder if operands are just a perfect or shuffled diamond match.
      auto &&SkipReordering = [this]() {
        SmallPtrSet<Value *, 4> UniqueValues;
        ArrayRef<OperandData> Op0 = OpsVec.front();
        for (const OperandData &Data : Op0)
          UniqueValues.insert(Data.V);
        for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
                return !UniqueValues.contains(Data.V);
              }))
            return false;
        }
        return true;
      };

      if (SkipReordering())
        return;
      bool StrategyFailed = false;
      // Mark all operand data as free to use.
      clearUsed();
      // We keep the original operand order for the FirstLane, so reorder the
      // rest of the lanes. We are visiting the nodes in a circular fashion,
      // using FirstLane as the center point and increasing the radius
      // distance.
      SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
      for (unsigned I = 0; I < NumOperands; ++I)
        MainAltOps[I].push_back(getData(I, FirstLane).V);

      for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
        // Visit the lane on the right and then the lane on the left.
        for (int Direction : {+1, -1}) {
          int Lane = FirstLane + Direction * Distance;
          if (Lane < 0 || Lane >= (int)NumLanes)
            continue;
          int LastLane = Lane - Direction;
          assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                 "Out of bounds");
          // Look for a good match for each operand.
          for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
            // Search for the operand that matches SortedOps[OpIdx][Lane-1].
            std::optional<unsigned> BestIdx = getBestOperand(
                OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
            // By not selecting a value, we allow the operands that follow to
            // select a better matching value. We will get a non-null value in
            // the next run of getBestOperand().
            if (BestIdx) {
              // Swap the current operand with the one returned by
              // getBestOperand().
              swap(OpIdx, *BestIdx, Lane);
            } else {
              // Enable the second pass.
              ReorderingModes[OpIdx] = ReorderingMode::Failed;
              StrategyFailed = true;
            }
            // Try to get the alternate opcode and follow it during analysis.
            if (MainAltOps[OpIdx].size() != 2) {
              OperandData &AltOp = getData(OpIdx, Lane);
              InstructionsState OpS =
                  getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
              if (OpS.getOpcode() && OpS.isAltShuffle())
                MainAltOps[OpIdx].push_back(AltOp.V);
            }
          }
        }
      }
      // Skip second pass if the strategy did not fail.
      if (!StrategyFailed)
        return;
    }
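    // End-to-end reordering example (illustrative): for the commutative bundle
    //   lane0: %a0 = add i32 %p, %q
    //   lane1: %a1 = add i32 %r, %p
    // starting from lane 0, getBestOperand() swaps %r and %p in lane 1 so that
    // operand 0 becomes the splat {%p, %p} and operand 1 becomes {%q, %r},
    // maximizing the look-ahead score of both operand columns.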
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    /// Debug print.
    static StringRef getModeStr(ReorderingMode RMode) {
      switch (RMode) {
      case ReorderingMode::Load: return "Load";
      case ReorderingMode::Opcode: return "Opcode";
      case ReorderingMode::Constant: return "Constant";
      case ReorderingMode::Splat: return "Splat";
      case ReorderingMode::Failed: return "Failed";
      }
      llvm_unreachable("Unimplemented Reordering Type");
    }

    /// Debug print.
    LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
      const unsigned Indent = 2;
      unsigned Cnt = 0;
      for (const OperandDataVec &OpDataVec : OpsVec) {
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          OS.indent(Indent) << "{";
          if (Value *V = OpData.V)
            OS << *V;
          else
            OS << "null";
          OS << ", APO:" << OpData.APO << "}\n";
        }
        OS << "\n";
      }
      return OS;
    }
#endif
  };
public:
  /// Evaluate each pair in \p Candidates and return the index of the pair with
  /// the highest score, deemed to have the best chance to form the root of a
  /// profitable tree to vectorize. Return std::nullopt if no candidate scored
  /// above \p Limit.
  std::optional<int>
  findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
                   int Limit = LookAheadHeuristics::ScoreFail) const {
    LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
                                  RootLookAheadMaxDepth);
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
      int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
                                               Candidates[I].second,
                                               /*U1=*/nullptr, /*U2=*/nullptr,
                                               /*CurrLevel=*/1, std::nullopt);
      if (Score > BestScore) {
        BestScore = Score;
        Index = I;
      }
    }
    return Index;
  }

  /// Mark the instruction as deleted so it is skipped in further analysis.
  void eraseInstruction(Instruction *I) { DeletedInstructions.insert(I); }

  /// Checks if the instruction was already analyzed for being a possible
  /// reduction root.
  bool isAnalyzedReductionRoot(Instruction *I) const {
    return AnalyzedReductionsRoots.count(I);
  }
  /// Register given instruction as already analyzed for being a possible
  /// reduction root.
  void analyzedReductionRoot(Instruction *I) {
    AnalyzedReductionsRoots.insert(I);
  }
  /// Clear the list of the analyzed reduction root instructions.
  void clearReductionData() {
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
  }
  /// Checks if the given value was not scheduled.
  bool isNotScheduled(const Value *V) const {
    return NonScheduledFirst.contains(V);
  }
private:
  /// Check if the values in \p E can be demoted to a smaller integer type.
  bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
                             unsigned &BitWidth,
                             SmallVectorImpl<unsigned> &ToDemote,
                             DenseSet<const TreeEntry *> &Visited,
                             unsigned &MaxDepthLevel,
                             bool &IsProfitableToDemote,
                             bool IsTruncRoot) const;

  /// Check if the operands on the edges \p Edges of the \p UserTE allow
  /// reordering (i.e. the operands can be reordered because they have only one
  /// user and are reorderable).
  bool canReorderOperands(TreeEntry *UserTE,
                          SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
                          ArrayRef<TreeEntry *> ReorderableGathers,
                          SmallVectorImpl<TreeEntry *> &GatherOps);

  /// Reorders the node \p TE keeping its reuse shuffle consistent.
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
  /// if any. If it is not vectorized (gather node), returns nullptr.
  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
    TreeEntry *TE = nullptr;
    const auto *It = find_if(VL, [&](Value *V) {
      TE = getTreeEntry(V);
      if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
        return true;
      auto It = MultiNodeScalars.find(V);
      if (It != MultiNodeScalars.end()) {
        for (TreeEntry *E : It->second) {
          if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
            TE = E;
            return true;
          }
        }
      }
      return false;
    });
    if (It != VL.end()) {
      assert(TE->isSame(VL) && "Expected same scalars.");
      return TE;
    }
    return nullptr;
  }

  /// Const version of \a getVectorizedOperand().
  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);
  }

  /// Checks if all users of \p I are part of the vectorization tree.
  bool areAllUsersVectorized(
      Instruction *I,
      const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;

  /// Return information about the vector formed for the specified operand of
  /// the entry \p E.
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;

  /// \returns Cast context for the given graph node.
  TargetTransformInfo::CastContextHint
  getCastContextHint(const TreeEntry &TE) const;

  /// This is the recursive part of buildTree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
                     const EdgeInfo &EI);

  /// \returns true if the ExtractElement/ExtractValue instructions in \p VL
  /// can be vectorized to use the original vector and need to be ordered.
  bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
                       SmallVectorImpl<unsigned> &CurrentOrder,
                       bool ResizeAllowed = false) const;

  /// Vectorize a single entry in the tree, the \p NodeIdx-th operand of the
  /// entry \p E.
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);

  /// Create a new vector from a list of scalar values.
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Args &...Params);

  /// Create a new vector from a list of scalar values.
  Value *createBuildVector(const TreeEntry *E);

  /// \returns the last instruction in the bundle of the entry \p E, used as
  /// the insertion point for the vectorized code.
  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  /// Tries to find extractelement instructions with constant indices from a
  /// fixed vector type and gathers them into a bunch, which highly likely can
  /// be detected as a shuffle of 1 or 2 input vectors.
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
  tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                             SmallVectorImpl<int> &Mask,
                             unsigned NumParts) const;

  /// Checks if the gathered \p VL can be represented as a single-register
  /// shuffle of previous tree entries.
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
      const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
      SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
      bool ForOrder);

  /// Checks if the gathered \p VL can be represented as multi-register
  /// shuffle(s) of previous tree entries.
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
  isGatherShuffledEntry(
      const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
      SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
      unsigned NumParts, bool ForOrder = false);

  /// Set the Builder insert point to one after the last instruction in the
  /// bundle.
  void setInsertPointAfterBundle(const TreeEntry *E);

  /// \returns whether the VectorizableTree is fully vectorizable and will be
  /// beneficial even if the tree height is tiny.
  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  /// Collects the stores which are users of the scalars of \p TE.
  SmallVector<SmallVector<StoreInst *>>
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  /// Iterates over the external store users of \p TE and builds the reorder
  /// indices implied by their pointer order.
  SmallVector<OrdersType, 1>
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
  /// This is the AST node for the vectorized tree. Every node corresponds to a
  /// bundle of scalars.
  struct TreeEntry {
    using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
        if (Mask.size() != VL.size() && VL.size() == Scalars.size())
          return std::equal(VL.begin(), VL.end(), Scalars.begin());
        return VL.size() == Mask.size() &&
               std::equal(VL.begin(), VL.end(), Mask.begin(),
                          [Scalars](Value *V, int Idx) {
                            return (isa<UndefValue>(V) &&
                                    Idx == PoisonMaskElem) ||
                                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
                          });
      };
      if (!ReorderIndices.empty()) {
        // TODO: implement matching with the reordered scalars.
        SmallVector<int> Mask;
        inversePermutation(ReorderIndices, Mask);
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          ::addMask(Mask, ReuseShuffleIndices);
          return IsSame(Scalars, Mask);
        }
        return false;
      }
      return IsSame(Scalars, ReuseShuffleIndices);
    }
    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return State == TreeEntry::NeedToGather &&
             UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;
    }
    /// \returns true if current entry has same operands as \p TE.
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        return false;
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (Used.test(K))
            continue;
          if (getOperand(K) == TE.getOperand(I)) {
            Used.set(K);
            break;
          }
        }
        // Check if we actually found the matching operand.
        if (PrevCount == Used.count())
          return false;
      }
      return true;
    }

    /// \return Final vectorization factor for the node. Defined by the total
    /// number of vectorized scalars, including those used several times and
    /// counted in \a ReuseShuffleIndices, if any.
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    }
    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    WeakTrackingVH VectorizedValue = nullptr;

    /// Do we need to gather this sequence or vectorize it (either with vector
    /// instruction or with scatter/gather intrinsics for store/load)?
    enum EntryState {
      Vectorize,
      ScatterVectorize,
      StridedVectorize,
      NeedToGather,
    };
    EntryState State;

    /// Does this sequence require some shuffling?
    SmallVector<int, 4> ReuseShuffleIndices;

    /// Does this entry require reordering?
    SmallVector<unsigned, 4> ReorderIndices;

    /// Points back to the VectorizableTree.
    VecTreeTy &Container;

    /// The TreeEntry index containing the user of this entry. We can actually
    /// have multiple users so the data structure is not truly a tree.
    SmallVector<EdgeInfo, 1> UserTreeIndices;

    /// The index of this treeEntry in VectorizableTree.
    int Idx = -1;

    /// The main and alternate instructions (null for gathered bundles).
    Instruction *MainOp = nullptr;
    Instruction *AltOp = nullptr;

    /// The operands of each instruction in each lane Operands[op_index][lane].
    SmallVector<ValueList, 2> Operands;

  public:
    /// Set this bundle's \p OpIdx'th operand to \p OpVL.
    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
      assert(Operands[OpIdx].empty() && "Already resized?");
      assert(OpVL.size() <= Scalars.size() &&
             "Number of operands is greater than the number of scalars.");
      Operands[OpIdx].resize(OpVL.size());
      copy(OpVL, Operands[OpIdx].begin());
    }

    /// Set the operands of this bundle in their original order.
    void setOperandsInOrder() {
      assert(Operands.empty() && "Already initialized?");
      auto *I0 = cast<Instruction>(Scalars[0]);
      Operands.resize(I0->getNumOperands());
      unsigned NumLanes = Scalars.size();
      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
           OpIdx != NumOperands; ++OpIdx) {
        Operands[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          auto *I = cast<Instruction>(Scalars[Lane]);
          assert(I->getNumOperands() == NumOperands &&
                 "Expected same number of operands");
          Operands[OpIdx][Lane] = I->getOperand(OpIdx);
        }
      }
    }
    /// \returns the number of operands.
    unsigned getNumOperands() const { return Operands.size(); }

    /// \returns the \p OpIdx operand of this TreeEntry.
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      assert(!Operands[OpIdx].empty() && "No operand available");
      return Operands[OpIdx][0];
    }

    /// Some of the instructions in the list have alternate opcodes.
    bool isAltShuffle() const { return MainOp != AltOp; }

    bool isOpcodeOrAlt(Instruction *I) const {
      unsigned CheckedOpcode = I->getOpcode();
      return (getOpcode() == CheckedOpcode ||
              getAltOpcode() == CheckedOpcode);
    }

    /// Chooses the correct key for scheduling data. If \p Op has the same (or
    /// alternate) opcode as the main opcode, the key is \p Op; otherwise it is
    /// the main operation.
    Value *isOneOf(Value *Op) const {
      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))
        return Op;
      return MainOp;
    }
    void setOperations(const InstructionsState &S) {
      MainOp = S.MainOp;
      AltOp = S.AltOp;
    }

    Instruction *getMainOp() const { return MainOp; }

    Instruction *getAltOp() const { return AltOp; }

    /// The main/alternate opcodes for the list of instructions.
    unsigned getOpcode() const { return MainOp ? MainOp->getOpcode() : 0; }

    unsigned getAltOpcode() const { return AltOp ? AltOp->getOpcode() : 0; }
    /// When ReuseShuffleIndices is empty it just returns the position of \p V
    /// within the vector of Scalars. Otherwise, try to remap on its reuse
    /// index.
    int findLaneForValue(Value *V) const {
      unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReorderIndices.empty())
        FoundLane = ReorderIndices[FoundLane];
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReuseShuffleIndices.empty()) {
        FoundLane = std::distance(ReuseShuffleIndices.begin(),
                                  find(ReuseShuffleIndices, FoundLane));
      }
      return FoundLane;
    }
    /// Return true if this is a non-power-of-2 node.
    bool isNonPowOf2Vec() const {
      bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    }
#ifndef NDEBUG
    /// Debug printer.
    LLVM_DUMP_METHOD void dump() const {
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          dbgs().indent(2) << *V << "\n";
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        dbgs().indent(2) << *V << "\n";
      dbgs() << "State: ";
      switch (State) {
      case Vectorize:
        dbgs() << "Vectorize\n";
        break;
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        break;
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        break;
      case NeedToGather:
        dbgs() << "NeedToGather\n";
        break;
      }
      dbgs() << "MainOp: ";
      if (MainOp)
        dbgs() << *MainOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "AltOp: ";
      if (AltOp)
        dbgs() << *AltOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "\n";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";
      dbgs() << "\n";
    }
#endif
  };
#ifndef NDEBUG
  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
                     InstructionCost VecCost, InstructionCost ScalarCost,
                     StringRef Banner) const {
    dbgs() << "SLP: " << Banner << ":\n";
    E->dump();
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP:     ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP:     VectorCost = " << VecCost << "\n";
    dbgs() << "SLP:     ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP:     ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
  }
#endif
  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = std::nullopt,
                          ArrayRef<unsigned> ReorderIndices = std::nullopt) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                        ReuseShuffleIndices, ReorderIndices);
  }
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          TreeEntry::EntryState EntryState,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = std::nullopt,
                          ArrayRef<unsigned> ReorderIndices = std::nullopt) {
    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
           "Need to vectorize gather entry?");
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->Scalars.assign(VL.begin(), VL.end());
      Last->setOperations(S);
    } else {
      // Reorder scalars and build final mask.
      Last->Scalars.assign(VL.size(), nullptr);
      transform(ReorderIndices, Last->Scalars.begin(),
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
                  return VL[Idx];
                });
      InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    }
    if (Last->State != TreeEntry::NeedToGather) {
      for (Value *V : VL) {
        const TreeEntry *TE = getTreeEntry(V);
        assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
               "Scalar already in tree!");
        if (TE) {
          if (TE != Last)
            MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
          continue;
        }
        ScalarToTreeEntry[V] = Last;
      }
      // Update the scheduler bundle to point to this TreeEntry.
      ScheduleData *BundleMember = *Bundle;
      assert((BundleMember || isa<PHINode>(S.MainOp) ||
              isVectorLikeInstWithConstOps(S.MainOp) ||
              doesNotNeedToSchedule(VL)) &&
             "Bundle and VL out of sync");
      if (BundleMember) {
        for (Value *V : VL) {
          if (doesNotNeedToBeScheduled(V))
            continue;
          if (!BundleMember)
            continue;
          BundleMember->TE = Last;
          BundleMember = BundleMember->NextInBundle;
        }
      }
      assert(!BundleMember && "Bundle and VL out of sync");
    } else {
      // Build a map for gathered scalars to the nodes where they are used.
      bool AllConstsOrCasts = true;
      for (Value *V : VL)
        if (!isConstant(V)) {
          auto *I = dyn_cast<CastInst>(V);
          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
          ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
        }
      if (AllConstsOrCasts)
        CastMaxMinBWSizes =
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert(VL.begin(), VL.end());
    }

    if (UserTreeIdx.UserTE) {
      Last->UserTreeIndices.push_back(UserTreeIdx);
      assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
             "Reordering isn't implemented for non-power-of-2 nodes yet");
    }
    return Last;
  }
  /// --- Vectorization State ---
  /// Holds all of the tree entries.
  TreeEntry::VecTreeTy VectorizableTree;

#ifndef NDEBUG
  /// Debug printer.
  LLVM_DUMP_METHOD void dumpVectorizableTree() const {
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
      dbgs() << "\n";
    }
  }
#endif
  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }

  const TreeEntry *getTreeEntry(Value *V) const {
    return ScalarToTreeEntry.lookup(V);
  }
  /// Check that the operand node of an alternate node does not generate a
  /// buildvector sequence that defeats the profit of the alternate shuffle.
  bool areAltOperandsProfitable(const InstructionsState &S,
                                ArrayRef<Value *> VL) const;

  /// Checks if the specified list of the instructions/values can be vectorized
  /// and fills required data before actual scheduling of the instructions.
  TreeEntry::EntryState getScalarsVectorizationState(
      InstructionsState &S, ArrayRef<Value *> VL,
      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
      SmallVectorImpl<Value *> &PointerOps) const;

  /// Maps a specific scalar to its tree entry.
  SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;

  /// List of scalars, used in several vectorized nodes.
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;

  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;

  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, int L)
        : Scalar(S), User(U), Lane(L) {}

    /// Which scalar in our function.
    Value *Scalar;
    /// Which user that uses the scalar.
    llvm::User *User;
    /// Which lane does the scalar belong to.
    int Lane;
  };
  using UserList = SmallVector<ExternalUser, 16>;
  /// Checks if two instructions may access the same memory. The result is
  /// cached in both directions.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    // First check if the result is already in the cache.
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto It = AliasCache.find(Key);
    if (It != AliasCache.end())
      return It->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    // Store the result in the cache.
    AliasCache.try_emplace(Key, Aliased);
    AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
    return Aliased;
  }

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  /// Cache for alias results.
  DenseMap<AliasCacheKey, bool> AliasCache;

  /// A list of values that need to extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User). External User
  /// can be present in the original function.
  UserList ExternalUses;
  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  struct ScheduleData {
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

    void init(int BlockSchedulingRegionID, Value *OpVal) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      OpValue = OpVal;
      TE = nullptr;
    }
    /// Verify basic self consistency properties.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

      if (IsScheduled) {
        assert(isSchedulingEntity() &&
               "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");
        }
      }

      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");
    }
    /// Returns true if the dependency information has been calculated.
    /// Note that depenendency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;
    }
    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }
    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      int Sum = 0;
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
          return InvalidDeps;
        Sum += BundleMember->UnscheduledDeps;
      }
      return Sum;
    }

    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }
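    // Dependency-counting example (illustrative): a bundle {%x, %y} where %x
    // has one unscheduled def-use dependency and %y has two reports
    // unscheduledDepsInBundle() == 3; each time schedule() retires a
    // dependency, incrementUnscheduledDeps(-1) is invoked, and the bundle
    // enters the ready list exactly when the sum reaches 0.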
    Instruction *Inst = nullptr;

    /// Opcode of the current instruction in the schedule data.
    Value *OpValue = nullptr;

    /// The TreeEntry that this instruction corresponds to.
    TreeEntry *TE = nullptr;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle = nullptr;

    /// Single linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle = nullptr;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore = nullptr;

    /// The dependent memory instructions.
    /// This list is derived on demand in calculateDependencies().
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// List of instructions which this instruction depends on control wise.
    SmallVector<ScheduleData *, 4> ControlDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;

    /// The number of dependencies; InvalidDeps if not calculated yet.
    int Dependencies = InvalidDeps;

    /// The number of dependencies minus the number of dependencies of
    /// scheduled instructions. The bundle gets ready once this reaches zero.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps = InvalidDeps;

    /// True if this instruction is scheduled (or considered as scheduled in
    /// the dry-run).
    bool IsScheduled = false;
  };

#ifndef NDEBUG
  friend inline raw_ostream &operator<<(raw_ostream &os,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(os);
    return os;
  }
#endif
  /// Contains all scheduling data for a basic block.
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      // Reduce the maximum schedule region size by the size of the previous
      // scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }
    ScheduleData *getScheduleData(Instruction *I) {
      if (BB != I->getParent())
        // Avoid lookup if can't possibly be in map.
        return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))
        return SD;
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V, Value *Key) {
      if (V == Key)
        return getScheduleData(V);
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end()) {
        ScheduleData *SD = I->second.lookup(Key);
        if (SD && isInSchedulingRegion(SD))
          return SD;
      }
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }
    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      LLVM_DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");

      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->Inst != BundleMember->OpValue)
          continue;

        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert to ready list if ready.
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
            if (OpDef && OpDef->hasValidDependencies() &&
                OpDef->incrementUnscheduledDeps(-1) == 0) {
              // There are no more unscheduled dependencies after decrementing,
              // so we can put the dependent instruction into the ready list.
              ScheduleData *DepBundle = OpDef->FirstInBundle;
              assert(!DepBundle->IsScheduled &&
                     "already scheduled bundle gets ready");
              ReadyList.insert(DepBundle);
              LLVM_DEBUG(dbgs()
                         << "SLP:    gets ready (def): " << *DepBundle << "\n");
            }
          });
        };

        // If BundleMember is a vector bundle, its operands may have been
        // reordered during buildTree(). We therefore need to get its operands
        // through the TreeEntry.
        if (TreeEntry *TE = BundleMember->TE) {
          // Need to search for the lane since the tree entry can be reordered.
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, BundleMember->Inst));
          assert(Lane >= 0 && "Lane not set");

          // Since vectorization tree is being built recursively this assertion
          // ensures that the tree entry has all operands set before reaching
          // this code. Couple of exceptions known at the moment are extracts
          // where their second (immediate) operand is not added. Since
          // immediates do not affect scheduler behavior this is considered
          // okay.
          auto *In = BundleMember->Inst;
          assert(
              In &&
              (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
               In->getNumOperands() == TE->getNumOperands()) &&
              "Missed TreeEntry operands?");
          (void)In; // fake use to avoid build failure when assertions disabled

          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              DecrUnsched(I);
        } else {
          // If BundleMember is a stand-alone instruction, no operand reordering
          // has taken place, so we directly access its operands.
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
              DecrUnsched(I);
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:    gets ready (mem): " << *DepBundle << "\n");
          }
        }
        // Handle the control dependencies.
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:    gets ready (ctl): " << *DepBundle << "\n");
          }
        }
      }
    }
    /// Verify basic self consistency properties of the data structure.
    void verify() {
      if (!ScheduleStart)
        return;

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
        (void)SD;
        doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
      }

      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");
        (void)SD;
      }
    }
    void doForAllOpcodes(Value *V,
                         function_ref<void(ScheduleData *SD)> Action) {
      if (ScheduleData *SD = getScheduleData(V))
        Action(SD);
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end())
        for (auto &P : I->second)
          if (isInSchedulingRegion(P.second))
            Action(P.second);
    }
    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        doForAllOpcodes(I, [&](ScheduleData *SD) {
          if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
              SD->isReady()) {
            ReadyList.insert(SD);
            LLVM_DEBUG(dbgs()
                       << "SLP:    initially in ready list: " << *SD << "\n");
          }
        });
      }
    }
    /// Checks if a bundle of instructions can be scheduled, i.e. has no cyclic
    /// dependencies. This is only a dry-run, no instructions are actually
    /// moved at this stage.
    /// \returns the scheduling bundle. The returned Optional value is not
    /// std::nullopt if \p VL is allowed to be scheduled.
    std::optional<ScheduleData *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S);

    /// Un-bundles a group of instructions.
    void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);

    /// Allocates schedule data chunks.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions/
    /// bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instruction in the scheduling region to un-scheduled.
    void resetSchedule();
    /// The block we are scheduling.
    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to Instruction.
    DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;

    /// Attaches ScheduleData to Instruction with the leading key.
    DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
        ExtraScheduleDataMap;

    /// The ready-list for scheduling (only used for the dry-run).
    SetVector<ScheduleData *> ReadyInsts;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart = nullptr;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd = nullptr;

    /// The first load/store in the scheduling region.
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last load/store in the scheduling region.
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region? Used to optimize the dependence calculation for the common
    /// case where there isn't.
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    /// The maximum size allowed for the scheduling region.
    int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;

    /// The ID of the scheduling region. For a new vectorization iteration this
    /// is incremented which "removes" all ScheduleData from the region.
    /// Make sure that the initial SchedulingRegionID is greater than the
    /// initial SchedulingRegionID in ScheduleData (which is 0).
    int SchedulingRegionID = 1;
  };
  /// Attaches the BlockScheduling structures to basic blocks.
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need extracting.
  const SmallDenseSet<Value *> *UserIgnoreList = nullptr;

  /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
  /// sorted SmallVectors of unsigned.
  struct OrdersTypeDenseMapInfo {
    static OrdersType getEmptyKey() {
      OrdersType V;
      V.push_back(~1U);
      return V;
    }

    static OrdersType getTombstoneKey() {
      OrdersType V;
      V.push_back(~2U);
      return V;
    }

    static unsigned getHashValue(const OrdersType &V) {
      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
    }

    static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
      return LHS == RHS;
    }
  };
  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
  unsigned MinVecRegSize; // Set by cl::opt (default: 128).

  /// Final size of the reduced vector, if the current graph represents the
  /// input for the reduction and it was possible to narrow the size of the
  /// reduction.
  unsigned ReductionBitWidth = 0;

  /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
  /// the type sizes used in the tree.
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
};

} // end namespace slpvectorizer

namespace llvm {

template <> struct GraphTraits<BoUpSLP *> {
  using TreeEntry = BoUpSLP::TreeEntry;
  using NodeRef = TreeEntry *;
  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
  /// Add the VectorizableTree to the iterator so we can access each child
  /// TreeEntry.
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    ContainerTy &VectorizableTree;

    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
                      ContainerTy &VT)
        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}

    NodeRef operator*() { return I->UserTE; }
  };

  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }

  static ChildIteratorType child_begin(NodeRef N) {
    return {N->UserTreeIndices.begin(), N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {N->UserTreeIndices.end(), N->Container};
  }

  /// For the node iterator we just need to turn the TreeEntry iterator into a
  /// TreeEntry* iterator so that it dereferences to NodeRef.
  class nodes_iterator {
    using ItTy = ContainerTy::iterator;
    ItTy It;

  public:
    nodes_iterator(const ItTy &It2) : It(It2) {}
    NodeRef operator*() { return It->get(); }
    nodes_iterator operator++() {
      ++It;
      return *this;
    }
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
  };

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }

  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};
template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
  using TreeEntry = BoUpSLP::TreeEntry;

  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << Entry->Idx << ".\n";
    if (isSplat(Entry->Scalars))
      OS << "<splat> ";
    for (auto *V : Entry->Scalars) {
      OS << *V;
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
          }))
        OS << " <extract>";
      OS << "\n";
    }
    return Str;
  }

  static std::string getNodeAttributes(const TreeEntry *Entry,
                                       const BoUpSLP *) {
    if (Entry->State == TreeEntry::NeedToGather)
      return "color=red";
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize)
      return "color=blue";
    return "";
  }
};

} // end namespace llvm
BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }

  // Cleanup any dead scalar code feeding the vectorized instructions.
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);

#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ``&& !verifyFunction(*F, ...)`` clause below.
  assert(!verifyFunction(*F, &dbgs()));
#endif
}
/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
/// contains original mask for the scalars reused in the node. Procedure
/// transforms the provided reuse mask by the given \p Mask.
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Reuses[Mask[I]] = Prev[I];
}
/// Reorders the given \p Order according to the given \p Mask. \p Order is the
/// original order of the scalars. If the resulting \p Order is just an
/// identity order, \p Order is cleared.
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
    Order.clear();
    return;
  }
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
  Type *ScalarTy = GatheredScalars.front()->getType();
  int NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars);
  int NumParts = TTI->getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= NumScalars)
    NumParts = 1;
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                            /*ForOrder=*/true);
  // No shuffled operands - ignore.
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
      Entries.front().front()->isSame(TE.Scalars)) {
    // Perfect match in the graph, will reuse the previously vectorized node.
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    int SingleElt = PoisonMaskElem;
    return all_of(Mask, [&](int I) {
      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
        SingleElt = I;
      return I == PoisonMaskElem || I == SingleElt;
    });
  };
  // Exclusive broadcast mask - ignore.
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  SmallBitVector ShuffledSubMasks(NumParts);
  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                  ArrayRef<int> Mask, int PartSz, int NumParts,
                                  function_ref<unsigned(unsigned)> GetVF) {
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
        continue;
      const int VF = GetVF(I);
      if (VF == 0)
        continue;
      MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, PartSz);
      // Shuffle of at least 2 vectors - ignore.
      if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      // Try to include as many elements from the mask as possible.
      int FirstMin = INT_MAX;
      int SecondVecFound = false;
      for (int K : seq<int>(0, PartSz)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem) {
          Value *V = GatheredScalars[I * PartSz + K];
          if (isConstant(V) && !isa<PoisonValue>(V)) {
            SecondVecFound = true;
            break;
          }
          continue;
        }
        if (Idx < VF) {
          if (FirstMin > Idx)
            FirstMin = Idx;
        } else {
          SecondVecFound = true;
          break;
        }
      }
      FirstMin = (FirstMin / PartSz) * PartSz;
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      for (int K : seq<int>(0, PartSz)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem)
          continue;
        Idx -= FirstMin;
        if (Idx >= PartSz) {
          SecondVecFound = true;
          break;
        }
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      }
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
    }
  };
  int PartSz = NumScalars / NumParts;
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          unsigned VF = 0;
          for (unsigned Idx : seq<unsigned>(0, PartSz)) {
            int K = I * PartSz + Idx;
            if (ExtractMask[K] == PoisonMaskElem)
              continue;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  // Check special corner case - single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
        return 0U;
      return std::max(Entries[I].front()->getVectorFactor(),
                      Entries[I].back()->getVectorFactor());
    });
  int NumUndefs =
      count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  if (!GEP1)
    return false;
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  if (!GEP2)
    return false;
  return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
         ((isConstant(GEP1->getOperand(1)) &&
           isConstant(GEP2->getOperand(1))) ||
          !CompareOpcodes ||
          getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
              .getOpcode());
}
/// Calculates minimal alignment as a common alignment.
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}
/// Check if \p Order represents reverse order.
static bool isReverseOrder(ArrayRef<unsigned> Order) {
  unsigned Sz = Order.size();
  return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
  });
}
/// Checks if the provided list of pointers \p PointerOps represents strided
/// pointers for type \p ElemTy, with a run-time (SCEV) stride.
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                  const DataLayout &DL, ScalarEvolution &SE,
                  SmallVectorImpl<unsigned> &SortedIndices,
                  Instruction *Inst = nullptr) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find lower/upper pointers from the PointerOps (i.e. with lowest and
  // highest addresses).
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return std::nullopt;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
    if (Diff->isNonConstantNegative()) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
    if (Diff1->isNonConstantNegative()) {
      PtrSCEVHighest = PtrSCEV;
      continue;
    }
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest.
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return nullptr;
  };
  // Stride_in_elements = Dist / element_size * (num_elems - 1).
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return std::nullopt;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  // Iterate through all pointers and check if all distances are unique
  // multiples of Stride.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    unsigned Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
      if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
                                                  SE.getMulExpr(Stride, SC)))
               ->isZero())
        return std::nullopt;
      Dist = SC->getAPInt().getZExtValue();
    }
    // If the strides are not the same or repeated, we can't vectorize.
    if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return std::nullopt;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices array only if it is non-consecutive.
    SortedIndices.resize(PointerOps.size(), 0);
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  if (!Inst)
    return nullptr;
  SCEVExpander Expander(SE, DL, "strided-load-vec");
  return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
}
BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
    ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
    SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
  // Check that a vectorized load would load the same memory as a scalar load.
  Type *ScalarTy = VL0->getType();

  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  // Make sure all loads in the bundle are simple - we can't vectorize
  // atomic or volatile loads.
  PointerOps.clear();
  const unsigned Sz = VL.size();
  PointerOps.resize(Sz);
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = cast<LoadInst>(V);
    if (!L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }

  Order.clear();
  auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
  // Check the order of pointer operands or that all pointers are the same.
  bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
  // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
  if (!Order.empty() && !isPowerOf2_32(VL.size())) {
    assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
                                   "supported with VectorizeNonPowerOf2");
    return LoadsState::Gather;
  }

  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
      TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
      calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
    return LoadsState::StridedVectorize;
  if (IsSorted || all_of(PointerOps, [&](Value *P) {
        return arePointersCompatible(P, PointerOps.front(), *TLI);
      })) {
    if (IsSorted) {
      Value *Ptr0;
      Value *PtrN;
      if (Order.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[Order.front()];
        PtrN = PointerOps[Order.back()];
      }
      std::optional<int> Diff =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      // Check that the sorted loads are consecutive.
      if (static_cast<unsigned>(*Diff) == Sz - 1)
        return LoadsState::Vectorize;
      bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
      // Try to generate a strided load node if the target supports it and the
      // potential stride is profitable (within -slp-max-stride, power-of-2,
      // or reversed order).
      if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
                                  (static_cast<unsigned>(std::abs(*Diff)) <=
                                       MaxProfitableLoadStride * Sz &&
                                   isPowerOf2_32(std::abs(*Diff)))) &&
                                 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
                                *Diff == -(static_cast<int>(Sz) - 1))) {
        int Stride = *Diff / static_cast<int>(Sz - 1);
        if (*Diff == Stride * static_cast<int>(Sz - 1)) {
          Align Alignment =
              cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
                  ->getAlign();
          if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
            // Iterate through all pointers and check if all distances are
            // unique multiples of Stride.
            SmallSet<int, 4> Dists;
            for (Value *Ptr : PointerOps) {
              int Dist = 0;
              if (Ptr == PtrN)
                Dist = *Diff;
              else if (Ptr != Ptr0)
                Dist =
                    *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
              if (((Dist / Stride) * Stride) != Dist ||
                  !Dists.insert(Dist).second)
                break;
            }
            if (Dists.size() == Sz)
              return LoadsState::StridedVectorize;
          }
        }
      }
    }
    auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
      unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
      unsigned MinVF = getMinVF(Sz);
      unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
      MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
      for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
        unsigned VectorizedCnt = 0;
        SmallVector<LoadsState> States;
        for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
             Cnt += VF, ++VectorizedCnt) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          SmallVector<unsigned> Order;
          SmallVector<Value *> PointerOps;
          LoadsState LS =
              canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
                                /*TryRecursiveCheck=*/false);
          // Check that the sorted loads are consecutive.
          if (LS == LoadsState::Gather)
            break;
          // If a reorder is needed - count it as a high-cost masked gather.
          if ((LS == LoadsState::Vectorize ||
               LS == LoadsState::StridedVectorize) &&
              !Order.empty() && !isReverseOrder(Order))
            LS = LoadsState::ScatterVectorize;
          States.push_back(LS);
        }
        // Can be vectorized later as a series of loads/insertelements.
        if (VectorizedCnt == VL.size() / VF) {
          // Compare masked gather cost and loads + insertsubvector costs.
          TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
          InstructionCost MaskedGatherCost = TTI.getGatherScatterOpCost(
              Instruction::Load, VecTy,
              cast<LoadInst>(VL0)->getPointerOperand(),
              /*VariableMask=*/false, CommonAlignment, CostKind);
          InstructionCost VecLdCost = 0;
          auto *SubVecTy = FixedVectorType::get(ScalarTy, VF);
          for (auto [I, LS] : enumerate(States)) {
            auto *LI0 = cast<LoadInst>(VL[I * VF]);
            switch (LS) {
            case LoadsState::Vectorize:
              VecLdCost += TTI.getMemoryOpCost(
                  Instruction::Load, SubVecTy, LI0->getAlign(),
                  LI0->getPointerAddressSpace(), CostKind,
                  TTI::OperandValueInfo());
              break;
            case LoadsState::StridedVectorize:
              VecLdCost += TTI.getStridedMemoryOpCost(
                  Instruction::Load, SubVecTy, LI0->getPointerOperand(),
                  /*VariableMask=*/false, CommonAlignment, CostKind);
              break;
            case LoadsState::ScatterVectorize:
              VecLdCost += TTI.getGatherScatterOpCost(
                  Instruction::Load, SubVecTy, LI0->getPointerOperand(),
                  /*VariableMask=*/false, CommonAlignment, CostKind);
              break;
            case LoadsState::Gather:
              llvm_unreachable(
                  "Expected only consecutive, strided or masked gather loads.");
            }
            SmallVector<int> ShuffleMask(VL.size());
            for (int Idx : seq<int>(0, VL.size()))
              ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
            VecLdCost +=
                TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, ShuffleMask,
                                   CostKind, I * VF, SubVecTy);
          }
          // If masked gather cost is higher - better to vectorize.
          if (MaskedGatherCost > VecLdCost)
            return true;
        }
      }
      return false;
    };
    // TODO: need to improve analysis of the pointers; if not all of them are
    // GEPs or have > 2 operands, we end up with a gather node.
    Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
    bool ProfitableGatherPointers =
        L && Sz > 2 && count_if(PointerOps, [L](Value *V) {
                         return L->isLoopInvariant(V);
                       }) <= Sz / 2;
    if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
          auto *GEP = dyn_cast<GetElementPtrInst>(P);
          return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
                 (GEP && GEP->getNumOperands() == 2 &&
                  isa<Constant, Instruction>(GEP->getOperand(1)));
        })) {
      Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
      if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
          !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
        // Check if potential masked gather can be represented as a series of
        // loads + insertsubvectors.
        if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
          // If masked gather cost is higher - better to vectorize, so consider
          // it as a gather node. It will be better estimated later.
          return LoadsState::Gather;
        }
        return LoadsState::ScatterVectorize;
      }
    }
  }
  return LoadsState::Gather;
}
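// Classification example (illustrative): for %p[0], %p[1], %p[2], %p[3] the
// pointer diffs give *Diff == Sz - 1 == 3, so the bundle is
// LoadsState::Vectorize; diffs {0, 2, 4, 6} give *Diff == 6 with stride 2, a
// LoadsState::StridedVectorize candidate when the target supports strided
// loads; arbitrary pointers fall through to masked gather or plain Gather.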
4488 "Expected list of pointer operands.");
4493 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4498 std::optional<int> Diff =
4504 Base.second.emplace_back(
Ptr, *Diff, Cnt++);
4510 if (Bases.
size() > VL.
size() / 2 - 1)
4514 Bases[
Ptr].emplace_back(
Ptr, 0, Cnt++);
4520 bool AnyConsecutive =
false;
4521 for (
auto &
Base : Bases) {
4522 auto &Vec =
Base.second;
4523 if (Vec.size() > 1) {
4525 const std::tuple<Value *, int, unsigned> &
Y) {
4526 return std::get<1>(
X) < std::get<1>(
Y);
4528 int InitialOffset = std::get<1>(Vec[0]);
4530 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
4536 SortedIndices.
clear();
4537 if (!AnyConsecutive)
4540 for (
auto &
Base : Bases) {
4541 for (
auto &
T :
Base.second)
4546 "Expected SortedIndices to be the size of VL");
4550std::optional<BoUpSLP::OrdersType>
4552 assert(TE.State == TreeEntry::NeedToGather &&
"Expected gather node only.");
4553 Type *ScalarTy = TE.Scalars[0]->getType();
4556 Ptrs.
reserve(TE.Scalars.size());
4557 for (
Value *V : TE.Scalars) {
4558 auto *L = dyn_cast<LoadInst>(V);
4559 if (!L || !L->isSimple())
4560 return std::nullopt;
4566 return std::move(Order);
4567 return std::nullopt;
4578 if (VU->
getType() != V->getType())
4581 if (!VU->
hasOneUse() && !V->hasOneUse())
4587 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4593 cast<VectorType>(VU->
getType())->getElementCount().getKnownMinValue());
4594 bool IsReusedIdx =
false;
4596 if (IE2 == VU && !IE1)
4598 if (IE1 == V && !IE2)
4599 return V->hasOneUse();
4600 if (IE1 && IE1 != V) {
4602 IsReusedIdx |= ReusedIdx.
test(Idx1);
4603 ReusedIdx.
set(Idx1);
4604 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
4607 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4609 if (IE2 && IE2 != VU) {
4611 IsReusedIdx |= ReusedIdx.
test(Idx2);
4612 ReusedIdx.
set(Idx2);
4613 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4616 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4618 }
while (!IsReusedIdx && (IE1 || IE2));
4622std::optional<BoUpSLP::OrdersType>
4625 if (TE.isNonPowOf2Vec())
4626 return std::nullopt;
4630 if (!TE.ReuseShuffleIndices.empty()) {
4632 return std::nullopt;
4640 unsigned Sz = TE.Scalars.size();
4641 if (TE.State == TreeEntry::NeedToGather) {
4642 if (std::optional<OrdersType> CurrentOrder =
4647 ::addMask(Mask, TE.ReuseShuffleIndices);
4648 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4649 unsigned Sz = TE.Scalars.size();
4650 for (
int K = 0,
E = TE.getVectorFactor() / Sz; K <
E; ++K) {
4653 Res[
Idx + K * Sz] =
I + K * Sz;
4655 return std::move(Res);
4658 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4660 TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
4661 return std::nullopt;
4665 if (TE.ReorderIndices.empty())
4666 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
4669 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4670 unsigned VF = ReorderMask.
size();
4672 unsigned NumParts = VF / Sz;
4674 for (
unsigned I = 0;
I < VF;
I += Sz) {
4676 unsigned UndefCnt = 0;
4685 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
4687 return std::nullopt;
4689 for (
unsigned K = 0; K < NumParts; ++K)
4690 ResOrder[Val + Sz * K] =
I + K;
4692 return std::move(ResOrder);
4694 unsigned VF = TE.getVectorFactor();
4697 TE.ReuseShuffleIndices.end());
4698 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4700 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4701 return Idx && *Idx < Sz;
4704 if (TE.ReorderIndices.empty())
4705 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
4708 for (
unsigned I = 0;
I < VF; ++
I) {
4709 int &
Idx = ReusedMask[
I];
4712 Value *V = TE.Scalars[ReorderMask[
Idx]];
4714 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
4720 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
4721 auto *It = ResOrder.
begin();
4722 for (
unsigned K = 0; K < VF; K += Sz) {
4726 std::iota(SubMask.begin(), SubMask.end(), 0);
4728 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
4729 std::advance(It, Sz);
4731 if (TE.State == TreeEntry::NeedToGather &&
4733 [](
const auto &
Data) {
return Data.index() ==
Data.value(); }))
4734 return std::nullopt;
4735 return std::move(ResOrder);
4737 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
4738 any_of(TE.UserTreeIndices,
4740 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
4742 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
4743 return std::nullopt;
4744 if ((TE.State == TreeEntry::Vectorize ||
4745 TE.State == TreeEntry::StridedVectorize) &&
4746 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4747 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4749 return TE.ReorderIndices;
4750 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4751 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
4752 Value *V1 = TE.Scalars[I1];
4753 Value *V2 = TE.Scalars[I2];
4754 if (V1 == V2 || (V1->
getNumUses() == 0 && V2->getNumUses() == 0))
4760 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->
user_begin());
4761 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4762 if (
auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4763 if (
auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4770 if (
auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4771 if (
auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4772 if (EE1->getOperand(0) != EE2->getOperand(0))
4778 auto IsIdentityOrder = [](
const OrdersType &Order) {
4779 for (
unsigned Idx : seq<unsigned>(0, Order.size()))
4784 if (!TE.ReorderIndices.empty())
4785 return TE.ReorderIndices;
4788 std::iota(Phis.begin(), Phis.end(), 0);
4790 for (
unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4793 for (
unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4794 ResOrder[Id] = PhiToId[Phis[Id]];
4795 if (IsIdentityOrder(ResOrder))
4796 return std::nullopt;
4797 return std::move(ResOrder);
4799 if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
4803 if ((TE.getOpcode() == Instruction::ExtractElement ||
4804 (
all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
4805 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
4807 auto *EE = dyn_cast<ExtractElementInst>(V);
4808 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4813 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4815 if (Reuse || !CurrentOrder.
empty())
4816 return std::move(CurrentOrder);
4824 int Sz = TE.Scalars.size();
4826 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
4828 find_if(TE.Scalars, [](
Value *V) { return !isConstant(V); });
4829 if (It == TE.Scalars.begin())
4832 if (It != TE.Scalars.end()) {
4834 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4849 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4852 return std::move(Order);
4857 return std::nullopt;
4858 if (TE.Scalars.size() >= 4)
4862 return CurrentOrder;
4864 return std::nullopt;
4874 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
4876 if (Cluster != FirstCluster)
4882void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const {
4885 const unsigned Sz =
TE.Scalars.size();
4887 if (
TE.State != TreeEntry::NeedToGather ||
4894 addMask(NewMask,
TE.ReuseShuffleIndices);
4896 TE.ReorderIndices.clear();
4903 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
4904 *
End =
TE.ReuseShuffleIndices.end();
4905 It !=
End; std::advance(It, Sz))
4906 std::iota(It, std::next(It, Sz), 0);
4912 "Expected same size of orders");
4913 unsigned Sz = Order.
size();
4915 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
4916 if (Order[
Idx] != Sz)
4917 UsedIndices.
set(Order[
Idx]);
4919 if (SecondaryOrder.
empty()) {
4920 for (
unsigned Idx : seq<unsigned>(0, Sz))
4921 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
4924 for (
unsigned Idx : seq<unsigned>(0, Sz))
4925 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
4926 !UsedIndices.
test(SecondaryOrder[
Idx]))
4927 Order[
Idx] = SecondaryOrder[
Idx];
4947 ExternalUserReorderMap;
4952 const std::unique_ptr<TreeEntry> &TE) {
4955 findExternalStoreUsersReorderIndices(TE.get());
4956 if (!ExternalUserReorderIndices.
empty()) {
4957 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
4959 std::move(ExternalUserReorderIndices));
4965 if (TE->isAltShuffle()) {
4968 unsigned Opcode0 = TE->getOpcode();
4969 unsigned Opcode1 = TE->getAltOpcode();
4972 for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
4973 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
4974 OpcodeMask.
set(Lane);
4976 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
4977 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
4983 if (std::optional<OrdersType> CurrentOrder =
4993 const TreeEntry *UserTE = TE.get();
4995 if (UserTE->UserTreeIndices.size() != 1)
4998 return EI.UserTE->State == TreeEntry::Vectorize &&
4999 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5002 UserTE = UserTE->UserTreeIndices.back().UserTE;
5005 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5006 if (!(TE->State == TreeEntry::Vectorize ||
5007 TE->State == TreeEntry::StridedVectorize) ||
5008 !TE->ReuseShuffleIndices.empty())
5009 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
5010 if (TE->State == TreeEntry::Vectorize &&
5011 TE->getOpcode() == Instruction::PHI)
5012 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
5017 for (
unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5019 auto It = VFToOrderedEntries.
find(VF);
5020 if (It == VFToOrderedEntries.
end())
5032 for (
const TreeEntry *OpTE : OrderedEntries) {
5035 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
5038 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5040 if (OpTE->State == TreeEntry::NeedToGather ||
5041 !OpTE->ReuseShuffleIndices.empty()) {
5042 auto It = GathersToOrders.find(OpTE);
5043 if (It != GathersToOrders.end())
5046 if (OpTE->isAltShuffle()) {
5047 auto It = AltShufflesToOrders.find(OpTE);
5048 if (It != AltShufflesToOrders.end())
5051 if (OpTE->State == TreeEntry::Vectorize &&
5052 OpTE->getOpcode() == Instruction::PHI) {
5053 auto It = PhisToOrders.
find(OpTE);
5054 if (It != PhisToOrders.
end())
5057 return OpTE->ReorderIndices;
5060 auto It = ExternalUserReorderMap.
find(OpTE);
5061 if (It != ExternalUserReorderMap.
end()) {
5062 const auto &ExternalUserReorderIndices = It->second;
5066 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5067 OrdersUses.insert(std::make_pair(
OrdersType(), 0)).first->second +=
5068 ExternalUserReorderIndices.size();
5070 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
5071 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5078 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5079 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5082 unsigned E = Order.size();
5085 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5088 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5090 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5093 if (OrdersUses.empty())
5096 const unsigned Sz = Order.size();
5097 for (
unsigned Idx : seq<unsigned>(0, Sz))
5098 if (
Idx != Order[
Idx] && Order[
Idx] != Sz)
5103 unsigned IdentityCnt = 0;
5104 unsigned FilledIdentityCnt = 0;
5106 for (
auto &Pair : OrdersUses) {
5107 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5108 if (!Pair.first.empty())
5109 FilledIdentityCnt += Pair.second;
5110 IdentityCnt += Pair.second;
5115 unsigned Cnt = IdentityCnt;
5116 for (
auto &Pair : OrdersUses) {
5120 if (Cnt < Pair.second ||
5121 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5122 Cnt == Pair.second && !BestOrder.
empty() &&
5123 IsIdentityOrder(BestOrder))) {
5125 BestOrder = Pair.first;
5132 if (IsIdentityOrder(BestOrder))
5138 unsigned E = BestOrder.
size();
5140 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5143 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5145 if (TE->Scalars.size() != VF) {
5146 if (TE->ReuseShuffleIndices.size() == VF) {
5152 return EI.UserTE->Scalars.size() == VF ||
5153 EI.UserTE->Scalars.size() ==
5156 "All users must be of VF size.");
5159 reorderNodeWithReuses(*TE, Mask);
5163 if ((TE->State == TreeEntry::Vectorize ||
5164 TE->State == TreeEntry::StridedVectorize) &&
5167 !TE->isAltShuffle()) {
5171 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5172 TE->reorderOperands(Mask);
5175 TE->reorderOperands(Mask);
5176 assert(TE->ReorderIndices.empty() &&
5177 "Expected empty reorder sequence.");
5180 if (!TE->ReuseShuffleIndices.empty()) {
5187 addMask(NewReuses, TE->ReuseShuffleIndices);
5188 TE->ReuseShuffleIndices.swap(NewReuses);
5194bool BoUpSLP::canReorderOperands(
5195 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5199 if (UserTE->isNonPowOf2Vec())
5202 for (
unsigned I = 0,
E = UserTE->getNumOperands();
I <
E; ++
I) {
5203 if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
5204 return OpData.first ==
I &&
5205 (OpData.second->State == TreeEntry::Vectorize ||
5206 OpData.second->State == TreeEntry::StridedVectorize);
5209 if (TreeEntry *TE = getVectorizedOperand(UserTE,
I)) {
5211 if (
any_of(TE->UserTreeIndices,
5212 [UserTE](
const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5216 Edges.emplace_back(
I, TE);
5222 if (TE->State != TreeEntry::Vectorize &&
5223 TE->State != TreeEntry::StridedVectorize &&
5224 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5228 TreeEntry *
Gather =
nullptr;
5230 [&
Gather, UserTE,
I](TreeEntry *TE) {
5231 assert(TE->State != TreeEntry::Vectorize &&
5232 TE->State != TreeEntry::StridedVectorize &&
5233 "Only non-vectorized nodes are expected.");
5234 if (
any_of(TE->UserTreeIndices,
5235 [UserTE,
I](
const EdgeInfo &EI) {
5236 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5238 assert(TE->isSame(UserTE->getOperand(
I)) &&
5239 "Operand entry does not match operands.");
5260 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5261 if (TE->State != TreeEntry::Vectorize &&
5262 TE->State != TreeEntry::StridedVectorize)
5264 if (std::optional<OrdersType> CurrentOrder =
5266 OrderedEntries.
insert(TE.get());
5267 if (!(TE->State == TreeEntry::Vectorize ||
5268 TE->State == TreeEntry::StridedVectorize) ||
5269 !TE->ReuseShuffleIndices.empty())
5270 GathersToOrders.
insert(TE.get());
5279 while (!OrderedEntries.
empty()) {
5284 for (TreeEntry *TE : OrderedEntries) {
5285 if (!(TE->State == TreeEntry::Vectorize ||
5286 TE->State == TreeEntry::StridedVectorize ||
5287 (TE->State == TreeEntry::NeedToGather &&
5289 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5292 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5294 !Visited.
insert(TE).second) {
5300 for (
EdgeInfo &EI : TE->UserTreeIndices) {
5301 TreeEntry *UserTE = EI.
UserTE;
5302 auto It =
Users.find(UserTE);
5303 if (It ==
Users.end())
5304 It =
Users.insert({UserTE, {}}).first;
5305 It->second.emplace_back(EI.
EdgeIdx, TE);
5309 for (TreeEntry *TE : Filtered)
5310 OrderedEntries.remove(TE);
5312 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5314 sort(UsersVec, [](
const auto &Data1,
const auto &Data2) {
5315 return Data1.first->Idx > Data2.first->Idx;
5317 for (
auto &
Data : UsersVec) {
5320 if (!canReorderOperands(
Data.first,
Data.second, NonVectorized,
5322 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5323 OrderedEntries.remove(
Op.second);
5336 for (
const auto &
Op :
Data.second) {
5337 TreeEntry *OpTE =
Op.second;
5338 if (!VisitedOps.
insert(OpTE).second)
5340 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
5342 const auto Order = [&]() ->
const OrdersType {
5343 if (OpTE->State == TreeEntry::NeedToGather ||
5344 !OpTE->ReuseShuffleIndices.empty())
5347 return OpTE->ReorderIndices;
5351 if (Order.size() == 1)
5354 Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
5355 return P.second == OpTE;
5358 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5359 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5362 unsigned E = Order.size();
5365 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5368 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5371 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5373 auto Res = OrdersUses.insert(std::make_pair(
OrdersType(), 0));
5374 const auto AllowsReordering = [&](
const TreeEntry *TE) {
5376 if (TE->isNonPowOf2Vec())
5378 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5379 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5380 (IgnoreReorder && TE->Idx == 0))
5382 if (TE->State == TreeEntry::NeedToGather) {
5391 for (
const EdgeInfo &EI : OpTE->UserTreeIndices) {
5392 TreeEntry *UserTE = EI.
UserTE;
5393 if (!VisitedUsers.
insert(UserTE).second)
5398 if (AllowsReordering(UserTE))
5406 if (
static_cast<unsigned>(
count_if(
5407 Ops, [UserTE, &AllowsReordering](
5408 const std::pair<unsigned, TreeEntry *> &
Op) {
5409 return AllowsReordering(
Op.second) &&
5412 return EI.UserTE == UserTE;
5414 })) <= Ops.
size() / 2)
5415 ++Res.first->second;
5418 if (OrdersUses.empty()) {
5419 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5420 OrderedEntries.remove(
Op.second);
5424 const unsigned Sz = Order.size();
5425 for (
unsigned Idx : seq<unsigned>(0, Sz))
5426 if (
Idx != Order[
Idx] && Order[
Idx] != Sz)
5431 unsigned IdentityCnt = 0;
5432 unsigned VF =
Data.second.front().second->getVectorFactor();
5434 for (
auto &Pair : OrdersUses) {
5435 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5436 IdentityCnt += Pair.second;
5441 unsigned Cnt = IdentityCnt;
5442 for (
auto &Pair : OrdersUses) {
5446 if (Cnt < Pair.second) {
5448 BestOrder = Pair.first;
5455 if (IsIdentityOrder(BestOrder)) {
5456 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
5457 OrderedEntries.remove(
Op.second);
5466 unsigned E = BestOrder.
size();
5468 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5470 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
5471 TreeEntry *TE =
Op.second;
5472 OrderedEntries.remove(TE);
5473 if (!VisitedOps.
insert(TE).second)
5475 if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
5476 reorderNodeWithReuses(*TE, Mask);
5480 if (TE->State != TreeEntry::Vectorize &&
5481 TE->State != TreeEntry::StridedVectorize &&
5482 (TE->State != TreeEntry::ScatterVectorize ||
5483 TE->ReorderIndices.empty()))
5485 assert((BestOrder.
size() == TE->ReorderIndices.size() ||
5486 TE->ReorderIndices.empty()) &&
5487 "Non-matching sizes of user/operand entries.");
5489 if (IgnoreReorder && TE == VectorizableTree.front().get())
5490 IgnoreReorder =
false;
5493 for (TreeEntry *
Gather : GatherOps) {
5495 "Unexpected reordering of gathers.");
5496 if (!
Gather->ReuseShuffleIndices.empty()) {
5502 OrderedEntries.remove(
Gather);
5506 if (
Data.first->State != TreeEntry::Vectorize ||
5507 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5508 Data.first->getMainOp()) ||
5509 Data.first->isAltShuffle())
5510 Data.first->reorderOperands(Mask);
5511 if (!isa<InsertElementInst, StoreInst>(
Data.first->getMainOp()) ||
5512 Data.first->isAltShuffle() ||
5513 Data.first->State == TreeEntry::StridedVectorize) {
5517 if (
Data.first->ReuseShuffleIndices.empty() &&
5518 !
Data.first->ReorderIndices.empty() &&
5519 !
Data.first->isAltShuffle()) {
5522 OrderedEntries.insert(
Data.first);
5530 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5531 VectorizableTree.front()->ReuseShuffleIndices.empty())
5532 VectorizableTree.front()->ReorderIndices.clear();
5539 for (
auto &TEPtr : VectorizableTree) {
5540 TreeEntry *Entry = TEPtr.get();
5543 if (Entry->State == TreeEntry::NeedToGather)
5547 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5548 Value *Scalar = Entry->Scalars[Lane];
5549 if (!isa<Instruction>(Scalar))
5552 auto It = ScalarToExtUses.
find(Scalar);
5553 if (It != ScalarToExtUses.
end() && !ExternalUses[It->second].User)
5557 const auto *ExtI = ExternallyUsedValues.
find(Scalar);
5558 if (ExtI != ExternallyUsedValues.
end()) {
5559 int FoundLane = Entry->findLaneForValue(Scalar);
5560 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract: Extra arg from lane "
5561 << FoundLane <<
" from " << *Scalar <<
".\n");
5562 ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size());
5563 ExternalUses.emplace_back(Scalar,
nullptr, FoundLane);
5566 for (
User *U : Scalar->users()) {
5574 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5578 if (TreeEntry *UseEntry = getTreeEntry(U)) {
5582 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5584 Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
5585 LLVM_DEBUG(
dbgs() <<
"SLP: \tInternal user will be removed:" << *U
5587 assert(UseEntry->State != TreeEntry::NeedToGather &&
"Bad state");
5591 if (It != ScalarToExtUses.
end()) {
5592 ExternalUses[It->second].User =
nullptr;
5597 int FoundLane = Entry->findLaneForValue(Scalar);
5599 <<
" from lane " << FoundLane <<
" from " << *Scalar
5601 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
5602 ExternalUses.emplace_back(Scalar, U, FoundLane);
5611BoUpSLP::collectUserStores(
const BoUpSLP::TreeEntry *TE)
const {
5613 for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5614 Value *V = TE->Scalars[Lane];
5620 for (
User *U : V->users()) {
5621 auto *SI = dyn_cast<StoreInst>(U);
5622 if (SI ==
nullptr || !SI->isSimple() ||
5626 if (getTreeEntry(U))
5630 auto &StoresVec = PtrToStoresMap[
Ptr];
5633 if (StoresVec.size() > Lane)
5636 if (!StoresVec.empty() &&
5637 SI->getParent() != StoresVec.back()->getParent())
5640 if (!StoresVec.empty() &&
5641 SI->getValueOperand()->getType() !=
5642 StoresVec.back()->getValueOperand()->getType())
5644 StoresVec.push_back(SI);
5647 return PtrToStoresMap;
5651 OrdersType &ReorderIndices)
const {
5659 StoreOffsetVec[0] = {S0, 0};
5662 for (
unsigned Idx : seq<unsigned>(1, StoresVec.
size())) {
5664 std::optional<int> Diff =
5666 SI->getPointerOperand(), *
DL, *SE,
5671 StoreOffsetVec[
Idx] = {StoresVec[
Idx], *Diff};
5676 stable_sort(StoreOffsetVec, [](
const std::pair<StoreInst *, int> &Pair1,
5677 const std::pair<StoreInst *, int> &Pair2) {
5678 int Offset1 = Pair1.second;
5679 int Offset2 = Pair2.second;
5680 return Offset1 < Offset2;
5684 for (
unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5685 if (StoreOffsetVec[
Idx].second != StoreOffsetVec[
Idx - 1].second + 1)
5690 ReorderIndices.reserve(StoresVec.
size());
5693 [SI](
const std::pair<StoreInst *, int> &Pair) {
5694 return Pair.first ==
SI;
5696 StoreOffsetVec.begin();
5697 ReorderIndices.push_back(
Idx);
5702 auto IsIdentityOrder = [](
const OrdersType &Order) {
5703 for (
unsigned Idx : seq<unsigned>(0, Order.size()))
5708 if (IsIdentityOrder(ReorderIndices))
5709 ReorderIndices.clear();
5716 for (
unsigned Idx : Order)
5723BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE)
const {
5724 unsigned NumLanes =
TE->Scalars.size();
5727 collectUserStores(TE);
5736 for (
const auto &Pair : PtrToStoresMap) {
5737 auto &StoresVec = Pair.second;
5739 if (StoresVec.size() != NumLanes)
5744 if (!canFormVector(StoresVec, ReorderIndices))
5749 ExternalReorderIndices.
push_back(ReorderIndices);
5751 return ExternalReorderIndices;
5757 UserIgnoreList = &UserIgnoreLst;
5760 buildTree_rec(Roots, 0,
EdgeInfo());
5767 buildTree_rec(Roots, 0,
EdgeInfo());
5774 Value *NeedsScheduling =
nullptr;
5775 for (
Value *V : VL) {
5778 if (!NeedsScheduling) {
5779 NeedsScheduling = V;
5784 return NeedsScheduling;
5795 bool AllowAlternate) {
5799 if (
auto *LI = dyn_cast<LoadInst>(V)) {
5802 SubKey =
hash_value(LoadsSubkeyGenerator(Key, LI));
5807 if (isa<ExtractElementInst, UndefValue>(V))
5809 if (
auto *EI = dyn_cast<ExtractElementInst>(V)) {
5811 !isa<UndefValue>(EI->getIndexOperand()))
5814 }
else if (
auto *
I = dyn_cast<Instruction>(V)) {
5817 if ((isa<BinaryOperator, CastInst>(
I)) &&
5827 : cast<CastInst>(
I)->getOperand(0)->getType()));
5829 if (isa<CastInst>(
I)) {
5830 std::pair<size_t, size_t> OpVals =
5836 }
else if (
auto *CI = dyn_cast<CmpInst>(
I)) {
5838 if (CI->isCommutative())
5844 }
else if (
auto *Call = dyn_cast<CallInst>(
I)) {
5858 }
else if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
5859 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
5860 SubKey =
hash_value(Gep->getPointerOperand());
5864 !isa<ConstantInt>(
I->getOperand(1))) {
5872 return std::make_pair(Key, SubKey);
5882bool BoUpSLP::areAltOperandsProfitable(
const InstructionsState &S,
5884 unsigned Opcode0 = S.getOpcode();
5885 unsigned Opcode1 = S.getAltOpcode();
5888 for (
unsigned Lane : seq<unsigned>(0, VL.
size()))
5889 if (cast<Instruction>(VL[Lane])->
getOpcode() == Opcode1)
5890 OpcodeMask.set(Lane);
5893 Opcode0, Opcode1, OpcodeMask))
5896 for (
unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
5900 Operands.back().push_back(cast<Instruction>(V)->getOperand(
I));
5904 for (
unsigned I : seq<unsigned>(0, VL.size() - 1)) {
5910 switch (Res.value_or(0)) {
5925 constexpr unsigned NumAltInsts = 3;
5926 unsigned NonInstCnt = 0;
5929 unsigned UndefCnt = 0;
5931 unsigned ExtraShuffleInsts = 0;
5940 return is_contained(Operands.back(), V);
5943 ++ExtraShuffleInsts;
5960 if (isa<Constant, ExtractElementInst>(V) ||
5961 getTreeEntry(V) || (L &&
L->isLoopInvariant(V))) {
5962 if (isa<UndefValue>(V))
5968 if (!Res.second && Res.first->second == 1)
5969 ++ExtraShuffleInsts;
5970 ++Res.first->getSecond();
5971 if (
auto *
I = dyn_cast<Instruction>(V))
5972 UniqueOpcodes.
insert(
I->getOpcode());
5973 else if (Res.second)
5976 return none_of(Uniques, [&](
const auto &
P) {
5977 return P.first->hasNUsesOrMore(
P.second + 1) &&
5979 return getTreeEntry(U) || Uniques.contains(U);
5988 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
5989 (UniqueOpcodes.
size() + NonInstCnt + ExtraShuffleInsts +
5990 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
5993BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
5996 assert(S.MainOp &&
"Expected instructions with same/alternate opcodes only.");
5998 unsigned ShuffleOrOp =
5999 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
6000 auto *VL0 = cast<Instruction>(S.OpValue);
6001 switch (ShuffleOrOp) {
6002 case Instruction::PHI: {
6005 for (
Value *
Incoming : cast<PHINode>(V)->incoming_values()) {
6007 if (Term &&
Term->isTerminator()) {
6009 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
6010 return TreeEntry::NeedToGather;
6014 return TreeEntry::Vectorize;
6016 case Instruction::ExtractValue:
6017 case Instruction::ExtractElement: {
6018 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6021 return TreeEntry::NeedToGather;
6022 if (Reuse || !CurrentOrder.empty())
6023 return TreeEntry::Vectorize;
6025 return TreeEntry::NeedToGather;
6027 case Instruction::InsertElement: {
6031 for (
Value *V : VL) {
6032 SourceVectors.
insert(cast<Instruction>(V)->getOperand(0));
6034 "Non-constant or undef index?");
6038 return !SourceVectors.contains(V);
6041 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
6042 "different source vectors.\n");
6043 return TreeEntry::NeedToGather;
6046 return TreeEntry::Vectorize;
6048 case Instruction::Load: {
6057 return TreeEntry::Vectorize;
6059 return TreeEntry::ScatterVectorize;
6061 return TreeEntry::StridedVectorize;
6064 Type *ScalarTy = VL0->getType();
6065 if (
DL->getTypeSizeInBits(ScalarTy) !=
6066 DL->getTypeAllocSizeInBits(ScalarTy))
6067 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
6069 [](
Value *V) {
return !cast<LoadInst>(V)->isSimple(); }))
6074 return TreeEntry::NeedToGather;
6078 case Instruction::ZExt:
6079 case Instruction::SExt:
6080 case Instruction::FPToUI:
6081 case Instruction::FPToSI:
6082 case Instruction::FPExt:
6083 case Instruction::PtrToInt:
6084 case Instruction::IntToPtr:
6085 case Instruction::SIToFP:
6086 case Instruction::UIToFP:
6087 case Instruction::Trunc:
6088 case Instruction::FPTrunc:
6089 case Instruction::BitCast: {
6090 Type *SrcTy = VL0->getOperand(0)->getType();
6091 for (
Value *V : VL) {
6092 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
6095 dbgs() <<
"SLP: Gathering casts with different src types.\n");
6096 return TreeEntry::NeedToGather;
6099 return TreeEntry::Vectorize;
6101 case Instruction::ICmp:
6102 case Instruction::FCmp: {
6106 Type *ComparedTy = VL0->getOperand(0)->getType();
6107 for (
Value *V : VL) {
6109 if ((
Cmp->getPredicate() != P0 &&
Cmp->getPredicate() != SwapP0) ||
6110 Cmp->getOperand(0)->getType() != ComparedTy) {
6111 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering cmp with different predicate.\n");
6112 return TreeEntry::NeedToGather;
6115 return TreeEntry::Vectorize;
6117 case Instruction::Select:
6118 case Instruction::FNeg:
6119 case Instruction::Add:
6120 case Instruction::FAdd:
6121 case Instruction::Sub:
6122 case Instruction::FSub:
6123 case Instruction::Mul:
6124 case Instruction::FMul:
6125 case Instruction::UDiv:
6126 case Instruction::SDiv:
6127 case Instruction::FDiv:
6128 case Instruction::URem:
6129 case Instruction::SRem:
6130 case Instruction::FRem:
6131 case Instruction::Shl:
6132 case Instruction::LShr:
6133 case Instruction::AShr:
6134 case Instruction::And:
6135 case Instruction::Or:
6136 case Instruction::Xor:
6137 return TreeEntry::Vectorize;
6138 case Instruction::GetElementPtr: {
6140 for (
Value *V : VL) {
6141 auto *
I = dyn_cast<GetElementPtrInst>(V);
6144 if (
I->getNumOperands() != 2) {
6145 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
6146 return TreeEntry::NeedToGather;
6152 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
6153 for (
Value *V : VL) {
6154 auto *
GEP = dyn_cast<GEPOperator>(V);
6157 Type *CurTy =
GEP->getSourceElementType();
6159 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
6160 return TreeEntry::NeedToGather;
6165 Type *Ty1 = VL0->getOperand(1)->getType();
6166 for (
Value *V : VL) {
6167 auto *
I = dyn_cast<GetElementPtrInst>(V);
6170 auto *
Op =
I->getOperand(1);
6171 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
6172 (
Op->getType() != Ty1 &&
6173 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
6174 Op->getType()->getScalarSizeInBits() >
6175 DL->getIndexSizeInBits(
6176 V->getType()->getPointerAddressSpace())))) {
6178 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
6179 return TreeEntry::NeedToGather;
6183 return TreeEntry::Vectorize;
6185 case Instruction::Store: {
6187 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
6190 if (
DL->getTypeSizeInBits(ScalarTy) !=
6191 DL->getTypeAllocSizeInBits(ScalarTy)) {
6192 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering stores of non-packed type.\n");
6193 return TreeEntry::NeedToGather;
6197 for (
Value *V : VL) {
6198 auto *
SI = cast<StoreInst>(V);
6199 if (!
SI->isSimple()) {
6201 return TreeEntry::NeedToGather;
6210 if (CurrentOrder.empty()) {
6211 Ptr0 = PointerOps.
front();
6212 PtrN = PointerOps.
back();
6214 Ptr0 = PointerOps[CurrentOrder.front()];
6215 PtrN = PointerOps[CurrentOrder.back()];
6217 std::optional<int> Dist =
6220 if (
static_cast<unsigned>(*Dist) == VL.size() - 1)
6221 return TreeEntry::Vectorize;
6225 return TreeEntry::NeedToGather;
6227 case Instruction::Call: {
6230 CallInst *CI = cast<CallInst>(VL0);
6241 return TreeEntry::NeedToGather;
6246 for (
unsigned J = 0; J != NumArgs; ++J)
6249 for (
Value *V : VL) {
6250 CallInst *CI2 = dyn_cast<CallInst>(V);
6256 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched calls:" << *CI <<
"!=" << *V
6258 return TreeEntry::NeedToGather;
6262 for (
unsigned J = 0; J != NumArgs; ++J) {
6265 if (ScalarArgs[J] != A1J) {
6267 <<
"SLP: mismatched arguments in call:" << *CI
6268 <<
" argument " << ScalarArgs[J] <<
"!=" << A1J <<
"\n");
6269 return TreeEntry::NeedToGather;
6278 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched bundle operands in calls:" << *CI
6279 <<
"!=" << *V <<
'\n');
6280 return TreeEntry::NeedToGather;
6284 return TreeEntry::Vectorize;
6286 case Instruction::ShuffleVector: {
6289 if (!S.isAltShuffle()) {
6290 LLVM_DEBUG(
dbgs() <<
"SLP: ShuffleVector are not vectorized.\n");
6291 return TreeEntry::NeedToGather;
6296 <<
"SLP: ShuffleVector not vectorized, operands are buildvector and "
6297 "the whole alt sequence is not profitable.\n");
6298 return TreeEntry::NeedToGather;
6301 return TreeEntry::Vectorize;
6305 return TreeEntry::NeedToGather;
6310 const EdgeInfo &UserTreeIdx) {
6316 auto TryToFindDuplicates = [&](
const InstructionsState &S,
6317 bool DoNotFail =
false) {
6320 for (
Value *V : VL) {
6327 auto Res = UniquePositions.try_emplace(V, UniqueValues.
size());
6332 size_t NumUniqueScalarValues = UniqueValues.
size();
6333 if (NumUniqueScalarValues == VL.size()) {
6334 ReuseShuffleIndicies.
clear();
6337 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6338 LLVM_DEBUG(
dbgs() <<
"SLP: Reshuffling scalars not yet supported "
6339 "for nodes with padding.\n");
6340 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6344 if (NumUniqueScalarValues <= 1 ||
6345 (UniquePositions.size() == 1 &&
all_of(UniqueValues,
6347 return isa<UndefValue>(V) ||
6350 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6351 if (DoNotFail && UniquePositions.size() > 1 &&
6352 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6354 return isa<ExtractElementInst>(V) ||
6355 areAllUsersVectorized(cast<Instruction>(V),
6359 if (PWSz == VL.size()) {
6360 ReuseShuffleIndicies.
clear();
6362 NonUniqueValueVL.
assign(UniqueValues.
begin(), UniqueValues.
end());
6363 NonUniqueValueVL.
append(PWSz - UniqueValues.
size(),
6364 UniqueValues.
back());
6365 VL = NonUniqueValueVL;
6370 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6381 if (!EphValues.
empty()) {
6382 for (
Value *V : VL) {
6383 if (EphValues.
count(V)) {
6385 <<
") is ephemeral.\n");
6386 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6396 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6401 cast<Instruction>(
I)->getOpcode() ==
6402 cast<Instruction>(S.MainOp)->getOpcode();
6404 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to max recursion depth.\n");
6405 if (TryToFindDuplicates(S))
6406 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6407 ReuseShuffleIndicies);
6412 if (S.getOpcode() == Instruction::ExtractElement &&
6413 isa<ScalableVectorType>(
6414 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6415 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to scalable vector type.\n");
6416 if (TryToFindDuplicates(S))
6417 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6418 ReuseShuffleIndicies);
6423 if (S.OpValue->getType()->isVectorTy() &&
6424 !isa<InsertElementInst>(S.OpValue)) {
6426 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6430 if (
StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6431 if (
SI->getValueOperand()->getType()->isVectorTy()) {
6432 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to store vector type.\n");
6433 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6442 auto &&NotProfitableForVectorization = [&S,
this,
6444 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6453 for (
Value *V : VL) {
6454 auto *
I = cast<Instruction>(V);
6456 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6460 if ((IsCommutative &&
6461 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6463 all_of(InstsCount, [](
unsigned ICnt) {
return ICnt < 2; })))
6465 assert(VL.size() == 2 &&
"Expected only 2 alternate op instructions.");
6467 auto *
I1 = cast<Instruction>(VL.front());
6468 auto *I2 = cast<Instruction>(VL.back());
6469 for (
int Op = 0,
E = S.MainOp->getNumOperands();
Op <
E; ++
Op)
6471 I2->getOperand(
Op));
6472 if (
static_cast<unsigned>(
count_if(
6473 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
6475 })) >= S.MainOp->getNumOperands() / 2)
6477 if (S.MainOp->getNumOperands() > 2)
6479 if (IsCommutative) {
6482 for (
int Op = 0,
E = S.MainOp->getNumOperands();
Op <
E; ++
Op)
6484 I2->getOperand((
Op + 1) %
E));
6486 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
6495 bool IsScatterVectorizeUserTE =
6496 UserTreeIdx.UserTE &&
6497 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6498 bool AreAllSameInsts =
6500 (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
6504 auto *
I = dyn_cast<GetElementPtrInst>(V);
6508 BB =
I->getParent();
6509 return BB ==
I->getParent() &&
I->getNumOperands() == 2;
6515 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6518 NotProfitableForVectorization(VL)) {
6519 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to C,S,B,O, small shuffle. \n");
6520 if (TryToFindDuplicates(S))
6521 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6522 ReuseShuffleIndicies);
6530 if (TreeEntry *
E = getTreeEntry(S.OpValue)) {
6531 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *S.OpValue <<
".\n");
6532 if (!
E->isSame(VL)) {
6533 auto It = MultiNodeScalars.
find(S.OpValue);
6534 if (It != MultiNodeScalars.
end()) {
6535 auto *TEIt =
find_if(It->getSecond(),
6536 [&](TreeEntry *ME) { return ME->isSame(VL); });
6537 if (TEIt != It->getSecond().end())
6547 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to partial overlap.\n");
6548 if (TryToFindDuplicates(S))
6549 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6550 ReuseShuffleIndicies);
6556 E->UserTreeIndices.push_back(UserTreeIdx);
6557 LLVM_DEBUG(
dbgs() <<
"SLP: Perfect diamond merge at " << *S.OpValue
6564 for (
Value *V : VL) {
6565 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
6568 if (getTreeEntry(V)) {
6570 <<
") is already in tree.\n");
6571 if (TryToFindDuplicates(S))
6572 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6573 ReuseShuffleIndicies);
6579 if (UserIgnoreList && !UserIgnoreList->empty()) {
6580 for (
Value *V : VL) {
6581 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6582 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to gathered scalar.\n");
6583 if (TryToFindDuplicates(S))
6584 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6585 ReuseShuffleIndicies);
6593 if (AreAllSameInsts && UserTreeIdx.UserTE &&
6594 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
6596 assert(S.OpValue->getType()->isPointerTy() &&
6597 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6598 "Expected pointers only.");
6600 const auto *It =
find_if(VL, IsaPred<GetElementPtrInst>);
6601 assert(It != VL.end() &&
"Expected at least one GEP.");
6607 auto *VL0 = cast<Instruction>(S.OpValue);
6614 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6623 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
6628 if (!TryToFindDuplicates(S,
true))
6634 TreeEntry::EntryState State = getScalarsVectorizationState(
6635 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
6636 if (State == TreeEntry::NeedToGather) {
6637 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6638 ReuseShuffleIndicies);
6642 auto &BSRef = BlocksSchedules[BB];
6644 BSRef = std::make_unique<BlockScheduling>(BB);
6646 BlockScheduling &BS = *BSRef;
6648 std::optional<ScheduleData *> Bundle =
6649 BS.tryScheduleBundle(UniqueValues,
this, S);
6650#ifdef EXPENSIVE_CHECKS
6655 LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
6656 assert((!BS.getScheduleData(VL0) ||
6657 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
6658 "tryScheduleBundle should cancelScheduling on failure");
6659 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
6660 ReuseShuffleIndicies);
6661 NonScheduledFirst.insert(VL.front());
6664 LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
6666 unsigned ShuffleOrOp = S.isAltShuffle() ?
6667 (
unsigned) Instruction::ShuffleVector : S.getOpcode();
6668 switch (ShuffleOrOp) {
6669 case Instruction::PHI: {
6670 auto *PH = cast<PHINode>(VL0);
6673 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
6678 for (
unsigned I = 0,
E = PH->getNumIncomingValues();
I <
E; ++
I) {
6688 Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
6689 PH->getIncomingBlock(
I)));
6693 for (
unsigned OpIdx = 0, OpE = OperandsVec.
size(); OpIdx != OpE; ++OpIdx)
6694 buildTree_rec(OperandsVec[OpIdx],
Depth + 1, {
TE, OpIdx});
6697 case Instruction::ExtractValue:
6698 case Instruction::ExtractElement: {
6699 if (CurrentOrder.empty()) {
6700 LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
6701 newTreeEntry(VL, Bundle , S, UserTreeIdx,
6702 ReuseShuffleIndicies);
6706 Op0.
assign(VL.size(), VL0->getOperand(0));
6707 VectorizableTree.back()->setOperand(0, Op0);
6711 dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence "
6713 for (
unsigned Idx : CurrentOrder)
6720 newTreeEntry(VL, Bundle , S, UserTreeIdx,
6721 ReuseShuffleIndicies, CurrentOrder);
6725 Op0.
assign(VL.size(), VL0->getOperand(0));
6726 VectorizableTree.back()->setOperand(0, Op0);
6729 case Instruction::InsertElement: {
6730 assert(ReuseShuffleIndicies.
empty() &&
"All inserts should be unique");
6732 auto OrdCompare = [](
const std::pair<int, int> &P1,
6733 const std::pair<int, int> &P2) {
6734 return P1.first > P2.first;
6737 decltype(OrdCompare)>
6738 Indices(OrdCompare);
6739 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
6741 Indices.emplace(
Idx,
I);
6743 OrdersType CurrentOrder(VL.size(), VL.size());
6744 bool IsIdentity =
true;
6745 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
6746 CurrentOrder[Indices.top().second] =
I;
6747 IsIdentity &= Indices.top().second ==
I;
6751 CurrentOrder.clear();
6752 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6753 std::nullopt, CurrentOrder);
6756 constexpr int NumOps = 2;
6758 for (
int I = 0;
I < NumOps; ++
I) {
6760 VectorOperands[
I].
push_back(cast<Instruction>(V)->getOperand(
I));
6762 TE->setOperand(
I, VectorOperands[
I]);
6764 buildTree_rec(VectorOperands[NumOps - 1],
Depth + 1, {
TE, NumOps - 1});
6767 case Instruction::Load: {
6774 TreeEntry *
TE =
nullptr;
6777 case TreeEntry::Vectorize:
6778 if (CurrentOrder.empty()) {
6780 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6781 ReuseShuffleIndicies);
6785 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6786 ReuseShuffleIndicies, CurrentOrder);
6789 TE->setOperandsInOrder();
6791 case TreeEntry::StridedVectorize:
6793 if (CurrentOrder.empty()) {
6794 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6795 UserTreeIdx, ReuseShuffleIndicies);
6797 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6798 UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
6800 TE->setOperandsInOrder();
6803 case TreeEntry::ScatterVectorize:
6805 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
6806 UserTreeIdx, ReuseShuffleIndicies);
6807 TE->setOperandsInOrder();
6808 buildTree_rec(PointerOps,
Depth + 1, {
TE, 0});
6809 LLVM_DEBUG(
dbgs() <<
"SLP: added a vector of non-consecutive loads.\n");
6811 case TreeEntry::NeedToGather:
6816 case Instruction::ZExt:
6817 case Instruction::SExt:
6818 case Instruction::FPToUI:
6819 case Instruction::FPToSI:
6820 case Instruction::FPExt:
6821 case Instruction::PtrToInt:
6822 case Instruction::IntToPtr:
6823 case Instruction::SIToFP:
6824 case Instruction::UIToFP:
6825 case Instruction::Trunc:
6826 case Instruction::FPTrunc:
6827 case Instruction::BitCast: {
6828 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
6829 std::make_pair(std::numeric_limits<unsigned>::min(),
6830 std::numeric_limits<unsigned>::max()));
6831 if (ShuffleOrOp == Instruction::ZExt ||
6832 ShuffleOrOp == Instruction::SExt) {
6833 CastMaxMinBWSizes = std::make_pair(
6834 std::max<unsigned>(
DL->getTypeSizeInBits(VL0->getType()),
6837 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
6839 }
else if (ShuffleOrOp == Instruction::Trunc) {
6840 CastMaxMinBWSizes = std::make_pair(
6842 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
6844 std::min<unsigned>(
DL->getTypeSizeInBits(VL0->getType()),
6846 ExtraBitWidthNodes.
insert(VectorizableTree.size() + 1);
6847 }
else if (ShuffleOrOp == Instruction::SIToFP ||
6848 ShuffleOrOp == Instruction::UIToFP) {
6849 unsigned NumSignBits =
6851 if (
auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
6853 NumSignBits = std::max(NumSignBits,
Mask.countl_zero());
6855 if (NumSignBits * 2 >=
6856 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
6857 ExtraBitWidthNodes.
insert(VectorizableTree.size() + 1);
6859 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6860 ReuseShuffleIndicies);
6863 TE->setOperandsInOrder();
6864 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
6868 Operands.push_back(cast<Instruction>(V)->getOperand(
I));
6874 case Instruction::ICmp:
6875 case Instruction::FCmp: {
6878 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6879 ReuseShuffleIndicies);
6887 "Commutative Predicate mismatch");
6888 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
6891 for (
Value *V : VL) {
6892 auto *
Cmp = cast<CmpInst>(V);
6895 if (
Cmp->getPredicate() != P0)
6905 if (ShuffleOrOp == Instruction::ICmp) {
6906 unsigned NumSignBits0 =
6908 if (NumSignBits0 * 2 >=
6909 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
6910 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
6911 unsigned NumSignBits1 =
6913 if (NumSignBits1 * 2 >=
6914 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
6915 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 1)->
Idx);
6919 case Instruction::Select:
6920 case Instruction::FNeg:
6921 case Instruction::Add:
6922 case Instruction::FAdd:
6923 case Instruction::Sub:
6924 case Instruction::FSub:
6925 case Instruction::Mul:
6926 case Instruction::FMul:
6927 case Instruction::UDiv:
6928 case Instruction::SDiv:
6929 case Instruction::FDiv:
6930 case Instruction::URem:
6931 case Instruction::SRem:
6932 case Instruction::FRem:
6933 case Instruction::Shl:
6934 case Instruction::LShr:
6935 case Instruction::AShr:
6936 case Instruction::And:
6937 case Instruction::Or:
6938 case Instruction::Xor: {
6939 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6940 ReuseShuffleIndicies);
6947 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
6955 TE->setOperandsInOrder();
6956 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
6960 Operands.push_back(cast<Instruction>(V)->getOperand(
I));
6966 case Instruction::GetElementPtr: {
6967 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
6968 ReuseShuffleIndicies);
6972 for (
Value *V : VL) {
6973 auto *
GEP = dyn_cast<GetElementPtrInst>(V);
6978 Operands.front().push_back(
GEP->getPointerOperand());
6987 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
6989 [VL0Ty, IndexIdx](
Value *V) {
6990 auto *
GEP = dyn_cast<GetElementPtrInst>(V);
6993 return VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
6996 :
DL->getIndexType(cast<GetElementPtrInst>(VL0)
6997 ->getPointerOperandType()
7000 for (
Value *V : VL) {
7001 auto *
I = dyn_cast<GetElementPtrInst>(V);
7004 ConstantInt::get(Ty, 0,
false));
7007 auto *
Op =
I->getOperand(IndexIdx);
7008 auto *CI = dyn_cast<ConstantInt>(
Op);
7013 CI, Ty, CI->getValue().isSignBitSet(), *
DL));
7017 for (
unsigned I = 0, Ops =
Operands.size();
I < Ops; ++
I)
7021 case Instruction::Store: {
7025 for (
Value *V : VL) {
7026 auto *
SI = cast<StoreInst>(V);
7027 *OIter =
SI->getValueOperand();
7031 if (CurrentOrder.empty()) {
7033 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7034 ReuseShuffleIndicies);
7035 TE->setOperandsInOrder();
7040 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7041 ReuseShuffleIndicies, CurrentOrder);
7042 TE->setOperandsInOrder();
7044 LLVM_DEBUG(
dbgs() <<
"SLP: added a vector of jumbled stores.\n");
7048 case Instruction::Call: {
7051 CallInst *CI = cast<CallInst>(VL0);
7054 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7055 ReuseShuffleIndicies);
7060 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7064 for (
unsigned I : seq<unsigned>(2, CI->
arg_size())) {
7068 for (
Value *V : VL) {
7069 auto *CI2 = cast<CallInst>(V);
7076 for (
unsigned I : seq<unsigned>(2, CI->
arg_size())) {
7083 TE->setOperandsInOrder();
7084 for (
unsigned I : seq<unsigned>(0, CI->
arg_size())) {
7091 for (
Value *V : VL) {
7092 auto *CI2 = cast<CallInst>(V);
7099 case Instruction::ShuffleVector: {
7100 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
7101 ReuseShuffleIndicies);
7105 auto *CI = dyn_cast<CmpInst>(VL0);
7106 if (isa<BinaryOperator>(VL0) || CI) {
7109 return cast<CmpInst>(V)->isCommutative();
7111 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
7113 auto *MainCI = cast<CmpInst>(S.MainOp);
7114 auto *AltCI = cast<CmpInst>(S.AltOp);
7118 "Expected different main/alternate predicates.");
7121 for (
Value *V : VL) {
7122 auto *
Cmp = cast<CmpInst>(V);
7144 TE->setOperandsInOrder();
7145 for (
unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
7149 Operands.push_back(cast<Instruction>(V)->getOperand(
I));
7165 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
7166 if (
auto *ST = dyn_cast<StructType>(EltTy)) {
7168 for (
const auto *Ty : ST->elements())
7169 if (Ty != *ST->element_begin())
7171 N *= ST->getNumElements();
7172 EltTy = *ST->element_begin();
7173 }
else if (
auto *AT = dyn_cast<ArrayType>(EltTy)) {
7174 N *= AT->getNumElements();
7175 EltTy = AT->getElementType();
7177 auto *VT = cast<FixedVectorType>(EltTy);
7178 N *= VT->getNumElements();
7179 EltTy = VT->getElementType();
7186 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7187 VTSize !=
DL->getTypeStoreSizeInBits(
T))
7194 bool ResizeAllowed)
const {
7195 const auto *It =
find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
7196 assert(It != VL.
end() &&
"Expected at least one extract instruction.");
7197 auto *E0 = cast<Instruction>(*It);
7199 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7203 Value *Vec = E0->getOperand(0);
7205 CurrentOrder.
clear();
7209 if (E0->getOpcode() == Instruction::ExtractValue) {
7214 LoadInst *LI = dyn_cast<LoadInst>(Vec);
7218 NElts = cast<FixedVectorType>(Vec->
getType())->getNumElements();
7221 unsigned E = VL.
size();
7222 if (!ResizeAllowed && NElts !=
E)
7225 unsigned MinIdx = NElts, MaxIdx = 0;
7227 auto *Inst = dyn_cast<Instruction>(V);
7230 if (Inst->getOperand(0) != Vec)
7232 if (
auto *EE = dyn_cast<ExtractElementInst>(Inst))
7233 if (isa<UndefValue>(EE->getIndexOperand()))
7238 const unsigned ExtIdx = *
Idx;
7239 if (ExtIdx >= NElts)
7241 Indices[
I] = ExtIdx;
7242 if (MinIdx > ExtIdx)
7244 if (MaxIdx < ExtIdx)
7247 if (MaxIdx - MinIdx + 1 >
E)
7249 if (MaxIdx + 1 <=
E)
7253 bool ShouldKeepOrder =
true;
7260 for (
unsigned I = 0;
I <
E; ++
I) {
7263 const unsigned ExtIdx = Indices[
I] - MinIdx;
7264 if (CurrentOrder[ExtIdx] !=
E) {
7265 CurrentOrder.
clear();
7268 ShouldKeepOrder &= ExtIdx ==
I;
7269 CurrentOrder[ExtIdx] =
I;
7271 if (ShouldKeepOrder)
7272 CurrentOrder.
clear();
7274 return ShouldKeepOrder;
7277bool BoUpSLP::areAllUsersVectorized(
7279 return (
I->hasOneUse() && (!VectorizedVals || VectorizedVals->
contains(
I))) ||
7281 return ScalarToTreeEntry.contains(U) ||
7282 isVectorLikeInstWithConstOps(U) ||
7283 (isa<ExtractElementInst>(U) && MustGather.contains(U));
7287static std::pair<InstructionCost, InstructionCost>
7295 if (
auto *FPCI = dyn_cast<FPMathOperator>(CI))
7296 FMF = FPCI->getFastMathFlags();
7299 dyn_cast<IntrinsicInst>(CI));
7300 auto IntrinsicCost =
7307 auto LibCost = IntrinsicCost;
7314 return {IntrinsicCost, LibCost};
7317void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7321 unsigned Sz = Scalars.size();
7324 if (!ReorderIndices.empty())
7326 for (
unsigned I = 0;
I < Sz; ++
I) {
7328 if (!ReorderIndices.empty())
7330 auto *OpInst = cast<Instruction>(Scalars[
Idx]);
7331 if (IsAltOp(OpInst)) {
7341 if (!ReuseShuffleIndices.empty()) {
7344 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7354 if (
auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7355 auto *AltCI = cast<CmpInst>(AltOp);
7358 assert(MainP != AltP &&
"Expected different main/alternate predicates.");
7359 auto *CI = cast<CmpInst>(
I);
7367 assert((MainP ==
P || AltP ==
P || MainP == SwappedP || AltP == SwappedP) &&
7368 "CmpInst expected to match either main or alternate predicate or "
7371 return MainP !=
P && MainP != SwappedP;
7378 const auto *Op0 = Ops.
front();
7384 const bool IsUniform =
all_of(Ops, [=](
Value *V) {
7388 const bool IsPowerOfTwo =
all_of(Ops, [](
Value *V) {
7390 if (
auto *CI = dyn_cast<ConstantInt>(V))
7391 return CI->getValue().isPowerOf2();
7394 const bool IsNegatedPowerOfTwo =
all_of(Ops, [](
Value *V) {
7396 if (
auto *CI = dyn_cast<ConstantInt>(V))
7397 return CI->getValue().isNegatedPowerOf2();
7402 if (IsConstant && IsUniform)
7404 else if (IsConstant)
7418class BaseShuffleAnalysis {
7425 int Limit =
Mask.size();
7437 if (Limit % VF == 0 &&
all_of(seq<int>(0, Limit / VF), [=](
int Idx) {
7453 unsigned VF =
Mask.size();
7455 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
7458 int MaskedIdx =
Mask[ExtMask[
I] % VF];
7499 bool SinglePermute) {
7503 while (
auto *SV = dyn_cast<ShuffleVectorInst>(
Op)) {
7505 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7511 if (isIdentityMask(Mask, SVTy,
false)) {
7512 if (!IdentityOp || !SinglePermute ||
7513 (isIdentityMask(Mask, SVTy,
true) &&
7515 IdentityMask.
size()))) {
7520 IdentityMask.
assign(Mask);
7540 if (SV->isZeroEltSplat()) {
7542 IdentityMask.
assign(Mask);
7544 int LocalVF =
Mask.size();
7546 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7547 LocalVF = SVOpTy->getNumElements();
7551 static_cast<unsigned>(
I) >= SV->getShuffleMask().size())
7553 ExtMask[
Idx] = SV->getMaskValue(
I);
7563 if (!IsOp1Undef && !IsOp2Undef) {
7565 for (
int &
I : Mask) {
7568 if (SV->getMaskValue(
I % SV->getShuffleMask().size()) ==
7575 SV->getShuffleMask().end());
7576 combineMasks(LocalVF, ShuffleMask, Mask);
7577 Mask.swap(ShuffleMask);
7579 Op = SV->getOperand(0);
7581 Op = SV->getOperand(1);
7583 if (
auto *OpTy = dyn_cast<FixedVectorType>(
Op->getType());
7584 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
7589 "Expected masks of same sizes.");
7594 Mask.swap(IdentityMask);
7595 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
7596 return SinglePermute &&
7597 (isIdentityMask(Mask, cast<FixedVectorType>(
V->getType()),
7599 (Shuffle &&
Mask.size() == Shuffle->getShuffleMask().size() &&
7600 Shuffle->isZeroEltSplat() &&
7613 template <
typename T,
typename ShuffleBuilderTy>
7615 ShuffleBuilderTy &Builder) {
7616 assert(V1 &&
"Expected at least one vector value.");
7618 Builder.resizeToMatch(V1, V2);
7619 int VF =
Mask.size();
7620 if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
7621 VF = FTy->getNumElements();
7628 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
7631 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
7633 CombinedMask1[
I] =
Mask[
I];
7635 CombinedMask2[
I] =
Mask[
I] - VF;
7642 (void)peekThroughShuffles(Op1, CombinedMask1,
false);
7643 (void)peekThroughShuffles(Op2, CombinedMask2,
false);
7646 if (
auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
7647 if (
auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
7652 ExtMask1[
Idx] = SV1->getMaskValue(
I);
7655 cast<FixedVectorType>(SV1->getOperand(1)->getType())
7657 ExtMask1, UseMask::SecondArg);
7662 ExtMask2[
Idx] = SV2->getMaskValue(
I);
7665 cast<FixedVectorType>(SV2->getOperand(1)->getType())
7667 ExtMask2, UseMask::SecondArg);
7668 if (SV1->getOperand(0)->getType() ==
7669 SV2->getOperand(0)->getType() &&
7670 SV1->getOperand(0)->getType() != SV1->getType() &&
7673 Op1 = SV1->getOperand(0);
7674 Op2 = SV2->getOperand(0);
7676 SV1->getShuffleMask().end());
7677 int LocalVF = ShuffleMask1.size();
7678 if (
auto *FTy = dyn_cast<FixedVectorType>(Op1->
getType()))
7679 LocalVF = FTy->getNumElements();
7680 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
7681 CombinedMask1.swap(ShuffleMask1);
7683 SV2->getShuffleMask().end());
7684 LocalVF = ShuffleMask2.size();
7685 if (
auto *FTy = dyn_cast<FixedVectorType>(Op2->
getType()))
7686 LocalVF = FTy->getNumElements();
7687 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
7688 CombinedMask2.swap(ShuffleMask2);
7691 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
7692 Builder.resizeToMatch(Op1, Op2);
7693 VF = std::max(cast<VectorType>(Op1->
getType())
7695 .getKnownMinValue(),
7696 cast<VectorType>(Op2->
getType())
7698 .getKnownMinValue());
7699 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
7702 "Expected undefined mask element");
7703 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
7709 isa<ShuffleVectorInst>(Op1) &&
7710 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
7712 return Builder.createIdentity(Op1);
7713 return Builder.createShuffleVector(
7717 if (isa<PoisonValue>(V1))
7718 return Builder.createPoison(
7719 cast<VectorType>(V1->
getType())->getElementType(),
Mask.size());
7721 bool IsIdentity = peekThroughShuffles(V1, NewMask,
true);
7722 assert(V1 &&
"Expected non-null value after looking through shuffles.");
7725 return Builder.createShuffleVector(V1, NewMask);
7726 return Builder.createIdentity(V1);
7742 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7745 Mask, NumSrcElts, NumSubElts,
Index)) {
7746 if (
Index + NumSubElts > NumSrcElts &&
7747 Index + NumSrcElts <=
static_cast<int>(Mask.size()))
/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Case 2: the pointers are the addresses of loads/stores vectorized as a
    // plain wide load/store. Only BasePtr is needed in vector code, plus any
    // GEPs with uses outside the scalar memory ops.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);
    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // Assume non-GEP pointers stay in vectorized code; their cost is
      // considered free anyway.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then there are no savings.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // Case 1: the pointers are arguments of a masked gather load; all the
    // scalar GEPs are removed as a result of vectorization.
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }
  return std::make_pair(ScalarCost, VecCost);
}
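// The pair returned above feeds GetGEPCostDiff in getEntryCost: the scalar
// term models the chain of per-lane GEPs that would remain in scalar code,
// while the vector term models the single base pointer (plus any GEPs that
// survive because of external uses). Their difference is the
// address-computation saving credited to the vectorized tree.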
  // Try to improve already built nodes, e.g. turn a vectorized consecutive
  // load plus reverse order into a strided load with negative stride.
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE.get();
    switch (E.getOpcode()) {
    case Instruction::Load: {
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // ...
      auto *BaseLI = cast<LoadInst>(E.Scalars.back());
      // ...
      InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
          Instruction::Load, VecTy, BaseLI->getPointerOperand(),
          /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
      if (StridedCost < OriginalVecCost)
        // Strided load is more profitable than consecutive load plus
        // reverse shuffle - transform the node to a strided load.
        E.State = TreeEntry::StridedVectorize;
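      // Example of the trade-off being evaluated here: for a reversed run of
      // consecutive loads the alternatives are (a) one wide load followed by
      // an SK_Reverse shuffle, or (b) one strided load with stride -1. On
      // targets where TTI reports a cheap strided load, (b) wins and the
      // node's state is switched to StridedVectorize.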
  /// While set, still trying to estimate the cost for the same nodes, so the
  /// actual cost estimation (virtual shuffle instruction emission) can be
  /// delayed.
  bool IsFinalized = false;
  /// While set, the shuffled nodes are the same and their cost was estimated
  /// already, so it can be delayed and merged with the next shuffles.
  bool SameNodesEstimated = true;
  const TargetTransformInfo &TTI;
  ArrayRef<Value *> VectorizedVals;
  BoUpSLP &R;
  SmallPtrSetImpl<Value *> &CheckedExtracts;
  constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
  SmallVector<int> CommonMask;

  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    if (auto *VTy = dyn_cast<VectorType>(Ty))
      return ConstantVector::getSplat(
          VTy->getElementCount(),
          getAllOnesValue(DL, VTy->getElementType()));
    return Constant::getAllOnesValue(Ty);
  }

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    // ...
    auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
    InstructionCost GatherCost = 0;
    SmallVector<Value *> Gathers(VL.begin(), VL.end());
    // Improve gather cost for gather of loads, if we can group some of the
    // loads into vector loads.
    InstructionsState S = getSameOpcode(VL, *R.TLI);
    const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
    unsigned MinVF = R.getMinVF(2 * Sz);
    if (VL.size() > 2 &&
        ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
         (InVectors.empty() &&
          any_of(seq<unsigned>(0, VL.size() / MinVF),
                 [&](unsigned Idx) {
                   ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
                   InstructionsState S = getSameOpcode(SubVL, *R.TLI);
                   return S.getOpcode() == Instruction::Load &&
                          !S.isAltShuffle();
                 }))) &&
        !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
        !isSplat(Gathers)) {
      InstructionCost BaseCost = R.getGatherCost(Gathers, !Root);
      SetVector<Value *> VectorizedLoads;
      SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
      SmallVector<unsigned> ScatterVectorized;
      unsigned StartIdx = 0;
      unsigned VF = VL.size() / 2;
      for (; VF >= MinVF; VF /= 2) {
        for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
             Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
            InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
            if (SliceS.getOpcode() != Instruction::Load ||
                SliceS.isAltShuffle())
              continue;
          }
          // ...
          SmallVector<Value *> PointerOps;
          OrdersType CurrentOrder;
          LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
                                              CurrentOrder, PointerOps);
          if ((LS == LoadsState::Vectorize && CurrentOrder.empty()) ||
              LS == LoadsState::StridedVectorize)
            VectorizedStarts.emplace_back(Cnt, LS);
          else if (LS == LoadsState::ScatterVectorize)
            ScatterVectorized.push_back(Cnt);
          else
            continue;
          VectorizedLoads.insert(Slice.begin(), Slice.end());
          // If the initial block was vectorized, no need to retry it.
          if (Cnt == StartIdx)
            StartIdx += VF;
        }
        // Whole array vectorized, or vectorizable parts found - exit.
        if (StartIdx >= VL.size())
          break;
        if (!VectorizedLoads.empty())
          break;
      }
      if (!VectorizedLoads.empty()) {
        unsigned NumParts = TTI.getNumberOfParts(VecTy);
        bool NeedInsertSubvectorAnalysis =
            !NumParts || (VL.size() / VF) > NumParts;
        // Get the cost for gathered loads.
        for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
          if (VectorizedLoads.contains(VL[I]))
            continue;
          GatherCost +=
              getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
        }
        // The cost for vectorized loads.
        InstructionCost ScalarsCost = 0;
        for (Value *V : VectorizedLoads) {
          auto *LI = cast<LoadInst>(V);
          ScalarsCost += TTI.getMemoryOpCost(
              Instruction::Load, LI->getType(), LI->getAlign(),
              LI->getPointerAddressSpace(), CostKind,
              TTI::OperandValueInfo(), LI);
        }
        auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
        for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
          auto *LI = cast<LoadInst>(VL[P.first]);
          Align Alignment = LI->getAlign();
          GatherCost +=
              P.second == LoadsState::Vectorize
                  ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
                                        LI->getPointerAddressSpace(), CostKind,
                                        TTI::OperandValueInfo(), LI)
                  : TTI.getStridedMemoryOpCost(
                        Instruction::Load, LoadTy, LI->getPointerOperand(),
                        /*VariableMask=*/false, Alignment, CostKind, LI);
          // Estimate GEP cost.
          SmallVector<Value *> PointerOps(VF);
          for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
            PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
          auto [ScalarGEPCost, VectorGEPCost] =
              getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
                          Instruction::Load, CostKind, LI->getType(), LoadTy);
          GatherCost += VectorGEPCost - ScalarGEPCost;
        }
        for (unsigned P : ScatterVectorized) {
          auto *LI0 = cast<LoadInst>(VL[P]);
          ArrayRef<Value *> Slice = VL.slice(P, VF);
          Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
          GatherCost += TTI.getGatherScatterOpCost(
              Instruction::Load, LoadTy, LI0->getPointerOperand(),
              /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
          // Estimate GEP cost.
          SmallVector<Value *> PointerOps(VF);
          for (auto [I, V] : enumerate(Slice))
            PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
          OrdersType Order;
          if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
                              Order)) {
            Value *Ptr0 = PointerOps.front();
            Type *ScalarTy = Ptr0->getType();
            auto *VecTy = FixedVectorType::get(ScalarTy, VF);
            auto [ScalarGEPCost, VectorGEPCost] =
                getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
                            CostKind, ScalarTy, VecTy);
            GatherCost += VectorGEPCost - ScalarGEPCost;
            if (!Order.empty()) {
              SmallVector<int> Mask;
              inversePermutation(Order, Mask);
              GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                                             VecTy, Mask, CostKind);
            }
          } else {
            GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true);
          }
        }
        if (NeedInsertSubvectorAnalysis) {
          // Add the cost for the subvectors insert.
          SmallVector<int> ShuffleMask(VL.size());
          for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
            for (unsigned Idx : seq<unsigned>(0, E))
              ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
            GatherCost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy,
                                           ShuffleMask, CostKind, I, LoadTy);
          }
        }
        GatherCost -= ScalarsCost;
        GatherCost = std::min(BaseCost, GatherCost);
      }
    } else if (!Root && isSplat(VL)) {
      // Found the broadcasting of the single scalar, calculate the cost as
      // the broadcast.
      const auto *It = find_if_not(VL, IsaPred<UndefValue>);
      assert(It != VL.end() && "Expected at least one non-undef value.");
      // Add broadcast for non-identity shuffle only.
      bool NeedShuffle =
          count(VL, *It) > 1 &&
          (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
      if (!NeedShuffle)
        return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                      CostKind, std::distance(VL.begin(), It),
                                      PoisonValue::get(VecTy), *It);
      SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
      transform(VL, ShuffleMask.begin(), [](Value *V) {
        return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
      });
      return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                    CostKind, 0, PoisonValue::get(VecTy),
                                    *It) +
             ::getShuffleCost(TTI, TTI::SK_Broadcast, VecTy, ShuffleMask,
                              CostKind, 0, nullptr);
    }
    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)
                ? TTI::TCC_Free
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
  }
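  // getBuildVectorCost thus picks the cheapest of several strategies for a
  // would-be gather node: re-vectorizing contiguous load runs (wide,
  // strided, or masked-gather loads plus insert-subvector shuffles), a
  // broadcast when the scalars form a splat, or a plain per-element gather;
  // the std::min against BaseCost keeps whichever estimate is lowest.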
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    unsigned NumSrcRegs = TTI.getNumberOfParts(
        FixedVectorType::get(VL.front()->getType(), NumElts));
    if (NumSrcRegs == 0)
      NumSrcRegs = 1;
    unsigned EltsPerVector = PowerOf2Ceil(std::max(
        divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
    auto CheckPerRegistersShuffle =
        [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
      DenseSet<int> RegIndices;
      // Check that if trying to permute same single/2 input vectors.
      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
      int FirstRegId = -1;
      for (int &I : Mask) {
        if (I == PoisonMaskElem)
          continue;
        int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2)
          ShuffleKind = TTI::SK_PermuteTwoSrc;
        I = (I % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    InstructionCost Cost = 0;
    // Process extracts in blocks of EltsPerVector to check if the source
    // vector operand can be re-used directly. If not, add the cost of a
    // shuffle that rebuilds the values into a vector register.
    for (unsigned Part = 0; Part < NumParts; ++Part) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice =
          Mask.slice(Part * EltsPerVector,
                     (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
                         ? Mask.size() % EltsPerVector
                         : EltsPerVector);
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask);
      if (!RegShuffleKind) {
        Cost += ::getShuffleCost(
            TTI, *ShuffleKinds[Part],
            FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
        continue;
      }
      if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
          !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector))
        Cost += ::getShuffleCost(
            TTI, *RegShuffleKind,
            FixedVectorType::get(VL.front()->getType(), EltsPerVector),
            SubMask);
    }
    return Cost;
  }
  /// Transforms mask \p CommonMask per given \p Mask to make proper set after
  /// shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx;
  }
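  // Worked example for CheckPerRegistersShuffle above: with NumElts = 8,
  // NumParts = 2 and EltsPerVector = 4, the sub-mask {8, 1, 9, 3} touches
  // two registers: the one holding source elements 8..11 (RegId 2, seen
  // first) and the one holding elements 0..3 (RegId 0). Two registers still
  // form one SK_PermuteTwoSrc, and the mask is renumbered register-locally
  // to {0, 5, 1, 7}; a third register would make the slice unshuffleable per
  // register, returning std::nullopt.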
  /// Estimates the permute cost for shuffling of 1 or 2 tree entries, trying
  /// to merge repeated shuffles of the same nodes into one estimate.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are reshuffling.
      // If the cost of reshuffling of E1 and E2 was requested already, there
      // is no need to estimate another cost with the sub-Mask; instead,
      // include this sub-Mask into CommonMask to estimate it later and avoid
      // double cost estimation.
      if ((InVectors.size() == 2 &&
           InVectors.front().get<const TreeEntry *>() == &E1 &&
           InVectors.back().get<const TreeEntry *>() == E2) ||
          (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
        ArrayRef<int> SubMask =
            ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
               "Expected all poisoned elements.");
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - estimate the cost for the matched nodes
      // and transform the mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF,
                      cast<FixedVectorType>(V1->getType())->getNumElements());
      } else {
        const auto *E = InVectors.front().get<const TreeEntry *>();
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      Cost += createShuffle(&E1, E2, Mask);
      transformMaskAfterShuffle(CommonMask, Mask);
    }
  }
  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      int Index = -1;
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
             (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
              Index == 0);
    }

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
    InstructionCost createPoison(Type *, unsigned) const {
      return TTI::TCC_Free;
    }
    void resizeToMatch(Value *&, Value *&) const {}
  };
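  // ShuffleCostBuilder mirrors the IRBuilder-based shuffle builder used at
  // codegen time, but every create* call returns an InstructionCost instead
  // of emitting IR. This lets BaseShuffleAnalysis::createShuffle run the
  // same shuffle-simplification logic twice: once to price a shuffle during
  // cost modeling and once to actually materialize it, keeping both in sync.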
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask.begin(), Mask.end());
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = P1.get<const TreeEntry *>();
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = P2.get<const TreeEntry *>();
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        SmallVector<int> E2Mask = E2->getCommonMask();
        if (!EMask.empty() || !E2Mask.empty()) {
          for (int &Idx : CommonMask) {
            if (Idx == PoisonMaskElem)
              continue;
            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
              Idx = EMask[Idx];
            else if (Idx >= static_cast<int>(CommonVF))
              Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
                    E->Scalars.size();
          }
        }
        CommonVF = E->Scalars.size();
      }
      V1 = getAllOnesValue(
          *R.DL, FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
      V2 = getAllOnesValue(
          *R.DL,
          FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle single entry node.
      const TreeEntry *E = P1.get<const TreeEntry *>();
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      }
      // Avoid the extra permute if the mask is merely the reorder of the
      // entry.
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 })) {
        // ...
      }
      V1 = getAllOnesValue(
          *R.DL, FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
    } else if (V1 && P2.isNull()) {
      // Shuffle single vector.
      CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle vector and tree node.
      unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      const TreeEntry *E2 = P2.get<const TreeEntry *>();
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      V2 = getAllOnesValue(
          *R.DL,
          FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
    } else if (!V1 && V2) {
      // Shuffle tree node and vector.
      unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      const TreeEntry *E1 = P1.get<const TreeEntry *>();
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      V1 = getAllOnesValue(
          *R.DL,
          FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
      V2 = getAllOnesValue(
          *R.DL,
          FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      CommonVF =
          std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (V1->getType() != V2->getType()) {
        V1 = getAllOnesValue(*R.DL, FixedVectorType::get(
            cast<FixedVectorType>(V1->getType())->getElementType(), CommonVF));
        V2 = getAllOnesValue(*R.DL, FixedVectorType::get(
            cast<FixedVectorType>(V1->getType())->getElementType(), CommonVF));
      }
    }
    InVectors.front() = Constant::getNullValue(FixedVectorType::get(
        cast<FixedVectorType>(V1->getType())->getElementType(),
        CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return BaseShuffleAnalysis::createShuffle<InstructionCost>(
        V1, V2, CommonMask, Builder);
  }
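  // The all-ones placeholder vectors built above exist only so that the cost
  // variant of createShuffle has concrete Value operands of the right fixed
  // vector type to hand to BaseShuffleAnalysis; no IR is emitted, and the
  // ShuffleCostBuilder turns every would-be instruction into a TTI cost.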
public:
  ShuffleCostEstimator(TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()),
        R(R), CheckedExtracts(CheckedExtracts) {}
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    ArrayRef<Value *> VL = E->Scalars;
    // If the resulting type is scalarized, do not adjust the cost.
    if (NumParts == VL.size())
      return nullptr;
    // Check if the same extractelements were vectorized already, so they can
    // be considered reused.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((!TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->State == TreeEntry::NeedToGather) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    unsigned SliceSize = VL.size() / NumParts;
    for (unsigned Part = 0; Part < NumParts; ++Part) {
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
      for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
        // Ignore non-extractelement scalars.
        if (isa<UndefValue>(V) ||
            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
          continue;
        // If all users of the instruction are going to be vectorized and this
        // instruction itself is not, consider this extractelement as dead and
        // remove its cost from the final cost of the vectorized tree. Avoid
        // adjusting the cost for extracts with multiple uses in different
        // graph entries.
        auto *EE = cast<ExtractElementInst>(V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        const TreeEntry *VE = R.getTreeEntry(V);
        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
            any_of(EE->users(),
                   [&](User *U) {
                     return isa<GetElementPtrInst>(U) &&
                            !R.areAllUsersVectorized(cast<Instruction>(U),
                                                     &VectorizedVals);
                   }) ||
            (VE && VE != E))
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for the instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          Instruction *Ext = EE->user_back();
          if (isa<SExtInst, ZExtInst>(Ext) &&
              all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() to calculate the cost of the
            // extractelement/ext pair.
            Cost -= TTI.getExtractWithExtendCost(
                Ext->getOpcode(), Ext->getType(),
                EE->getVectorOperandType(), Idx);
            // Add back the cost of s|zext which is subtracted separately.
            Cost += TTI.getCastInstrCost(
                Ext->getOpcode(), Ext->getType(), EE->getType(),
                TTI::getCastContextHint(Ext), CostKind, Ext);
            continue;
          }
        }
        Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
                                       CostKind, Idx);
      }
    }
    // Check that the gather of extractelements can be represented as just a
    // shuffle of a single vector or of 2 input vectors.
    Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    // ...
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      VecBase = Constant::getNullValue(
          FixedVectorType::get(VL.front()->getType(), CommonMask.size()));
    }
    return VecBase;
  }
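  // In short: adjustExtracts credits back the scalar extractelements that
  // die once their users are vectorized, then charges computeExtractCost for
  // the one- or two-source permute that rebuilds the gathered vector. When
  // the extracts span several source vectors across several registers, the
  // base vector cannot be reused directly and UseVecBaseAsInput is set.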
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (&E1 == &E2) {
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy =
        FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
    if (NumParts == 0 || NumParts >= Mask.size())
      NumParts = 1;
    unsigned SliceSize = Mask.size() / NumParts;
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy =
        FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
    if (NumParts == 0 || NumParts >= Mask.size())
      NumParts = 1;
    unsigned SliceSize = Mask.size() / NumParts;
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    // May come only for shuffling of 2 vectors with extractelements, already
    // handled in adjustExtracts.
    assert(InVectors.size() == 1 &&
           all_of(enumerate(CommonMask),
                  [&](auto P) {
                    if (P.value() == PoisonMaskElem)
                      return Mask[P.index()] == PoisonMaskElem;
                    auto *EI = cast<ExtractElementInst>(
                        InVectors.front()
                            .get<const TreeEntry *>()
                            ->Scalars[P.index()]);
                    return EI->getVectorOperand() == V1 ||
                           EI->getVectorOperand() == V2;
                  }) &&
           "Expected extractelement vectors.");
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled them in adjustExtracts.
      assert(
          InVectors.size() == 1 &&
          InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
          all_of(enumerate(CommonMask),
                 [&](auto P) {
                   Value *Scalar = InVectors.front()
                                       .get<const TreeEntry *>()
                                       ->Scalars[P.index()];
                   if (P.value() == PoisonMaskElem)
                     return P.value() == Mask[P.index()] ||
                            isa<UndefValue>(Scalar);
                   if (isa<Constant>(V1))
                     return true;
                   auto *EI = cast<ExtractElementInst>(Scalar);
                   return EI->getVectorOperand() == V1;
                 }) &&
          "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(VF, cast<FixedVectorType>(
                            InVectors.front().get<Value *>()->getType())
                            ->getNumElements());
    }
    // ...
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // FIXME: Need to find a way to avoid use of getNullValue here.
      SmallVector<Constant *> Vals;
      unsigned VF = VL.size();
      if (MaskVF != 0)
        VF = std::min(VF, MaskVF);
      for (Value *V : VL.take_front(VF)) {
        if (isa<UndefValue>(V)) {
          Vals.push_back(cast<Constant>(V));
          continue;
        }
        Vals.push_back(Constant::getNullValue(V->getType()));
      }
      return ConstantVector::get(Vals);
    }
    return ConstantVector::getSplat(
        ElementCount::getFixed(
            cast<FixedVectorType>(Root->getType())->getNumElements()),
        getAllOnesValue(*R.DL, VL.front()->getType()));
  }
  InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles.
  InstructionCost
  finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = Vec.get<Value *>();
      Action(V, CommonMask);
      InVectors.front() = V;
    }
    ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }

  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  Value *Op = E->getOperand(Idx).front();
  if (const TreeEntry *TE = getTreeEntry(Op)) {
    if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
          return EI.EdgeIdx == Idx && EI.UserTE == E;
        }) != TE->UserTreeIndices.end())
      return TE;
    auto MIt = MultiNodeScalars.find(Op);
    if (MIt != MultiNodeScalars.end()) {
      for (const TreeEntry *TE : MIt->second) {
        if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
              return EI.EdgeIdx == Idx && EI.UserTE == E;
            }) != TE->UserTreeIndices.end())
          return TE;
      }
    }
  }
  const auto *It =
      find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::NeedToGather &&
               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                 return EI.EdgeIdx == Idx && EI.UserTE == E;
               }) != TE->UserTreeIndices.end();
      });
  assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
  return It->get();
}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    SmallVector<int> Mask;
    inversePermutation(TE.ReorderIndices, Mask);
    if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
      return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}
  Type *ScalarTy = VL[0]->getType();
  if (E->State != TreeEntry::NeedToGather) {
    if (auto *SI = dyn_cast<StoreInst>(VL[0]))
      ScalarTy = SI->getValueOperand()->getType();
    else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
      ScalarTy = CI->getOperand(0)->getType();
    else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
      ScalarTy = IE->getOperand(1)->getType();
  }
  if (!isValidElementType(ScalarTy))
    return InstructionCost::getInvalid();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // If a smaller type was computed for the expression, update VecTy so the
  // costs are accurate.
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end())
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
  auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
  unsigned EntryVF = E->getVectorFactor();
  auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);

  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
  if (E->State == TreeEntry::NeedToGather) {
    if (allConstant(VL))
      return 0;
    if (isa<InsertElementInst>(VL[0]))
      return InstructionCost::getInvalid();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  InstructionCost CommonCost = 0;
  SmallVector<int> Mask;
  bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
  if (!E->ReorderIndices.empty() &&
      (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
    } else {
      inversePermutation(E->ReorderIndices, NewMask);
    }
    ::addMask(Mask, NewMask);
  }
  if (NeedToShuffleReuses)
    ::addMask(Mask, E->ReuseShuffleIndices);
  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
    CommonCost =
        TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize) &&
         "Unhandled state");
  assert(E->getOpcode() &&
         ((allSameType(VL) && allSameBlock(VL)) ||
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy())) &&
         "Invalid VL");
  Instruction *VL0 = E->getMainOp();
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  SetVector<Value *> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (getTreeEntry(UniqueValues[I]) == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (const TreeEntry *OpTE = getTreeEntry(V))
      return getCastContextHint(*OpTE);
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
  auto GetCostDiff =
      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
          function_ref<InstructionCost(InstructionCost)> GetVectorCost) {
        // Calculate the cost of this instruction.
        InstructionCost ScalarCost = 0;
        if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
          // For some instructions there is no need to calculate the cost for
          // each lane; the cost of a single instruction times the number of
          // scalar instructions is enough.
          ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
        } else {
          for (unsigned I = 0; I < Sz; ++I) {
            if (UsedScalars.test(I))
              continue;
            ScalarCost += ScalarEltCost(I);
          }
        }
        InstructionCost VecCost = GetVectorCost(CommonCost);
        // Check if the current node must be resized, if the parent node is
        // not resized.
        if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
          const EdgeInfo &EI = E->UserTreeIndices.front();
          if ((EI.UserTE->getOpcode() != Instruction::Select ||
               EI.EdgeIdx != 0) &&
              It != MinBWs.end()) {
            auto UserBWIt = MinBWs.find(EI.UserTE);
            Type *UserScalarTy =
                EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
            if (UserBWIt != MinBWs.end())
              UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                              UserBWIt->second.first);
            if (ScalarTy != UserScalarTy) {
              unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
              unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
              unsigned VecOpcode;
              auto *UserVecTy =
                  FixedVectorType::get(UserScalarTy, E->getVectorFactor());
              if (BWSz > SrcBWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              TTI::CastContextHint CCH = GetCastContextHint(VL0);
              VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy,
                                               CCH, CostKind);
            }
          }
        }
        LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
                                 ScalarCost, "Calculated costs for Tree"));
        return VecCost - ScalarCost;
      };
  // Calculate the cost difference from vectorizing a set of GEPs.
  // A negative value means vectorizing is profitable.
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize) &&
           "Entry state expected to be Vectorize or StridedVectorize here.");
    InstructionCost ScalarCost = 0;
    InstructionCost VecCost = 0;
    std::tie(ScalarCost, VecCost) = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
  };
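  // GetCostDiff is the central costing pattern of getEntryCost: each opcode
  // case below supplies a per-lane scalar cost callback and a whole-vector
  // cost callback, and the returned VecCost - ScalarCost is negative exactly
  // when vectorizing this node is expected to be profitable. CommonCost (the
  // reorder/reuse shuffles computed above) is threaded into the vector side
  // so it is paid only once.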
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Count reused scalars.
    InstructionCost ScalarCost = 0;
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I)
        Operands[I] = PHI->getIncomingValue(I);
      if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
        if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
          if (!OpTE->ReuseShuffleIndices.empty())
            ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                            OpTE->Scalars.size());
    }
    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *I = cast<Instruction>(UniqueValues[Idx]);
      VectorType *SrcVecTy;
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        else
          NumElts = AggregateTy->getStructNumElements();
        SrcVecTy = FixedVectorType::get(OrigScalarTy, NumElts);
      }
      if (I->hasOneUse()) {
        Instruction *Ext = I->user_back();
        if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
            all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
          // Use getExtractWithExtendCost() to calculate the cost of the
          // extractelement/ext pair.
          InstructionCost Cost = TTI->getExtractWithExtendCost(
              Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
          // Subtract the cost of s|zext which is subtracted separately.
          Cost -= TTI->getCastInstrCost(
              Ext->getOpcode(), Ext->getType(), I->getType(),
              TTI::getCastContextHint(Ext), CostKind, Ext);
          return Cost;
        }
      }
      return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
                                     CostKind, *getExtractIndex(I));
    };
    auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();

    unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    unsigned OffsetBeg = *getInsertIndex(VL.front());
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(VL.drop_front())) {
      std::optional<unsigned> InsertIdx = getInsertIndex(V);
      if (!InsertIdx)
        continue;
      unsigned Idx = *InsertIdx;
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
    if (NumOfParts > 0)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // Check if we can safely insert a subvector. If not, generate a
    // whole-sized vector and shuffle the source and the new subvector.
    if (OffsetBeg + InsertVecSz > VecSz) {
      // Align OffsetBeg to generate a correct mask.
      OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
      InsertVecSz = VecSz;
    }

    APInt DemandedElts = APInt::getZero(NumElts);
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
    } else {
      Mask.assign(VecSz, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");

    InstructionCost Cost = 0;
    Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
                                          /*Insert*/ true, /*Extract*/ false,
                                          CostKind);

    // First cost - resize to the actual vector size if not identity shuffle
    // or the vector needs to be shifted.
    auto *InsertVecTy = FixedVectorType::get(ScalarTy, InsertVecSz);
    if (!IsIdentity)
      Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, InsertVecTy, Mask);
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    // ...
    SmallBitVector InMask =
        isUndefVector(FirstInsert->getOperand(0),
                      buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        auto *ActualVecTy = FixedVectorType::get(ScalarTy, VecSz);
        Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
                                    std::nullopt, CostKind, OffsetBeg - Offset,
                                    InsertVecTy);
      } else {
        // ...
        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
          Mask[I] =
              ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
        Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
      }
    }
    return Cost;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    Type *SrcScalarTy = VL0->getOperand(0)->getType();
    auto *SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
        SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   VL0->getOperand(0)->getType(),
                                   TTI::getCastContextHint(VI), CostKind, VI);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count the cost if minimum bitwidth is in effect and the cast
      // is just a bitcast (a noop here).
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpInst::Predicate VecPred, SwappedVecPred;
    auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
    if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
        match(VL0, MatchCmp))
      SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
    else
      SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
                                     ? CmpInst::BAD_FCMP_PREDICATE
                                     : CmpInst::BAD_ICMP_PREDICATE;
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
                                           ? CmpInst::BAD_FCMP_PREDICATE
                                           : CmpInst::BAD_ICMP_PREDICATE;
      auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
      if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
           !match(VI, MatchCmp)) ||
          (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
        VecPred = SwappedVecPred = CmpInst::BAD_ICMP_PREDICATE;
      return TTI->getCmpSelInstrCost(E->getOpcode(), OrigScalarTy,
                                     Builder.getInt1Ty(), CurrentPred, CostKind,
                                     VI);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
      InstructionCost VecCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
      // Check if it is possible and profitable to use min/max intrinsics for
      // the selects in VL.
      auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
      if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
        IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
                                          {VecTy, VecTy});
        InstructionCost IntrinsicCost =
            TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
        // If the selects are the only uses of the compares, the compares will
        // be dead; adjust the cost by removing theirs.
        if (IntrinsicAndUse.second)
          IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,
                                                   MaskTy, VecPred, CostKind);
        VecCost = std::min(VecCost, IntrinsicCost);
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
      TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
      TTI::OperandValueInfo Op2Info =
          TTI::getOperandInfo(VI->getOperand(OpIdx));
      SmallVector<const Value *> Operands(VI->operand_values());
      return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
                                         Op1Info, Op2Info, Operands, VI);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
      TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
      TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, std::nullopt, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, TTI::OperandValueInfo(), VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      if (E->State == TreeEntry::Vectorize) {
        VecLdCost = TTI->getMemoryOpCost(
            Instruction::Load, VecTy, LI0->getAlign(),
            LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
      } else if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      }
      return VecLdCost + CommonCost;
    };

    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // If this node generates a masked gather load, it is not a terminal node;
    // the address operand cost is estimated separately.
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;

    // Estimate the cost of GEPs since this tree node is a terminator.
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL))
      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, OpInfo, VI);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // We know that we can merge the stores. Calculate the cost.
      TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
      return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                  BaseSI->getPointerAddressSpace(), CostKind,
                                  OpInfo) +
             CommonCost;
    };
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL)) {
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    }
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      // ...
      return TTI->getCallInstrCost(CI->getCalledFunction(),
                                   CI->getFunctionType()->getReturnType(),
                                   CI->getFunctionType()->params(), CostKind);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      SmallVector<Type *> ArgTys =
          buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
                                 It != MinBWs.end() ? It->second.first : 0);
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::ShuffleVector: {
    assert(E->isAltShuffle() &&
           ((Instruction::isBinaryOp(E->getOpcode()) &&
             Instruction::isBinaryOp(E->getAltOpcode())) ||
            (Instruction::isCast(E->getOpcode()) &&
             Instruction::isCast(E->getAltOpcode())) ||
            (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
           "Invalid Shuffle Vector Operand");
    // Try to find a previous shuffle node with the same operands and same
    // main/alternate ops.
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE.get() == E)
          break;
        if (TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
      (void)E;
      return TTI->getInstructionCost(VI, CostKind);
    };
    // CommonCost is cleared since the final shuffle cost is included into the
    // vector cost.
    auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
      // VecCost is the sum of the cost of creating 2 vectors and the cost of
      // creating the shuffle.
      InstructionCost VecCost = 0;
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          E->dump();
        });
        // No need to add new vector costs here since we're going to reuse the
        // same main/alternate vector ops, just with different shuffling.
      } else if (Instruction::isBinaryOp(E->getOpcode())) {
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
        VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
                                            CI0->getPredicate(), CostKind, VL0);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            E->getAltOp());
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        auto *SrcTy = FixedVectorType::get(SrcSclTy, VL.size());
        if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
          auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz =
              DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
          if (SrcIt != MinBWs.end()) {
            SrcBWSz = SrcIt->second.first;
            SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
            SrcTy = FixedVectorType::get(SrcSclTy, VL.size());
          }
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              VecCost =
                  TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                          TTI::CastContextHint::None, CostKind);
            LLVM_DEBUG({
              dbgs()
                  << "SLP: alternate extension, which should be truncated.\n";
              E->dump();
            });
            return VecCost;
          }
        }
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          TTI::CastContextHint::None, CostKind);
        VecCost +=
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                    TTI::CastContextHint::None, CostKind);
      }
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            return I->getOpcode() == E->getAltOpcode();
          },
          Mask);
      VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc,
                                  FinalVecTy, Mask);
      // Patterns like [fadd,fsub] can be combined into a single instruction
      // on x86. Reordering them into [fsub,fadd] blocks this pattern, so the
      // lane order must be taken into account.
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(E->Scalars.size(), false);
      for (unsigned Lane : seq<unsigned>(0, E->Scalars.size()))
        if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1)
          OpcodeMask.set(Lane);
      // If this pattern is supported by the target, consider its cost.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      // TODO: Check the reverse order too.
      return VecCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  default:
    llvm_unreachable("Unknown instruction");
  }
}
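// For an alternate-opcode node the vector cost is the sum of both vector
// opcodes plus one SK_PermuteTwoSrc blend that interleaves their lanes,
// unless the target legalizes the pair directly (isLegalAltInstr, e.g.
// x86 addsub for the [fadd,fsub] pattern), in which case the cheaper of
// the two estimates is used.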
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable .\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    SmallVector<int> Mask;
    return TE->State == TreeEntry::NeedToGather &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
            TE->Scalars.size() < Limit ||
            ((TE->getOpcode() == Instruction::ExtractElement ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(TE->Scalars, Mask)) ||
            (TE->State == TreeEntry::NeedToGather &&
             TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
  };

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather node if it can be fully matched.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
      (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize))
    return false;

  return true;
}
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value. Arbitrarily follow the path
  // through operand 0 of any 'or', and peek through optional
  // shift-left-by-multiple-of-8-bits.
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // Check if the input is an extended load of the required or/shift
  // expression.
  Value *Load;
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;

  // Require that the total load bit width is a legal integer type. For
  // example, <8 x i8> --> i64 is legal on a 64-bit target, but <16 x i8> -->
  // i128 is not, so the backend probably can't reduce it.
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
    return false;

  // Everything matched - assume that we can fold the whole sequence using
  // load combining.
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");

  return true;
}
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /*MustMatchOrInst=*/false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  // Peek through a final sequence of stores and check if all operations are
  // likely to be load-combined.
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    Value *X;
    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
        !isLoadCombineCandidateImpl(X, NumElts, TTI, /*MustMatchOrInst=*/true))
      return false;
  }
  return true;
}

bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->State == TreeEntry::NeedToGather &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         allConstant(VectorizableTree[1]->Scalars))))
    return true;

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for vectorization (with the default cost threshold); the cost
  // of vectorized PHI nodes is almost always 0 plus the cost of the
  // extractelements.
  constexpr int Limit = 4;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->State == TreeEntry::NeedToGather &&
                TE->getOpcode() != Instruction::ExtractElement &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;
  // We can vectorize the tree if its size is greater than or equal to the
  // minimum size specified by the MinTreeSize command line option.
  if (VectorizableTree.size() >= MinTreeSize)
    return false;

  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;

  // Check if any of the gather nodes form an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       allSameBlock(VectorizableTree.front()->Scalars));
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::NeedToGather &&
               all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, UndefValue>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return false;

  assert(VectorizableTree.empty()
             ? ExternalUses.empty()
             : true && "We shouldn't have any external users");

  // Otherwise, we can't vectorize the tree. It is both tiny and not fully
  // vectorizable.
  return true;
}
InstructionCost BoUpSLP::getSpillCost() const {
  // Walk from the bottom of the tree to the top, tracking which values are
  // live. When we see a call instruction that is not part of our tree, query
  // TTI to see if there is a cost to keeping values live over it (for
  // example, if spills and fills are required).
  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
  InstructionCost Cost = 0;

  SmallPtrSet<Instruction *, 4> LiveValues;
  Instruction *PrevInst = nullptr;

  // The entries in VectorizableTree are not necessarily ordered by their
  // position in basic blocks. Collect them and order them by dominance so
  // later instructions are guaranteed to be visited first. Using dominance
  // also makes the order deterministic.
  SmallVector<Instruction *, 16> OrderedScalars;
  for (const auto &TEPtr : VectorizableTree) {
    if (TEPtr->State != TreeEntry::Vectorize)
      continue;
    Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
    if (!Inst)
      continue;
    OrderedScalars.push_back(Inst);
  }
  llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
    auto *NodeA = DT->getNode(A->getParent());
    auto *NodeB = DT->getNode(B->getParent());
    assert(NodeA && "Should only process reachable instructions");
    assert(NodeB && "Should only process reachable instructions");
    assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    if (NodeA != NodeB)
      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
    return B->comesBefore(A);
  });

  for (Instruction *Inst : OrderedScalars) {
    if (!PrevInst) {
      PrevInst = Inst;
      continue;
    }

    // Update LiveValues.
    LiveValues.erase(PrevInst);
    for (auto &J : PrevInst->operands()) {
      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
        LiveValues.insert(cast<Instruction>(&*J));
    }

    LLVM_DEBUG({
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
      Inst->dump();
    });

    // Now find the sequence of instructions between PrevInst and Inst.
    unsigned NumCalls = 0;
    BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
                                 PrevInstIt =
                                     PrevInst->getIterator().getReverse();
    while (InstIt != PrevInstIt) {
      if (PrevInstIt == PrevInst->getParent()->rend()) {
        PrevInstIt = Inst->getParent()->rbegin();
        continue;
      }

      auto NoCallIntrinsic = [this](Instruction *I) {
        if (auto *II = dyn_cast<IntrinsicInst>(I)) {
          if (II->isAssumeLikeIntrinsic())
            return true;
          FastMathFlags FMF;
          SmallVector<Type *, 4> Tys;
          for (auto &ArgOp : II->args())
            Tys.push_back(ArgOp->getType());
          if (auto *FPMO = dyn_cast<FPMathOperator>(II))
            FMF = FPMO->getFastMathFlags();
          IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
                                      FMF);
          InstructionCost IntrCost =
              TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
          InstructionCost CallCost = TTI->getCallInstrCost(
              nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
          if (IntrCost < CallCost)
            return true;
        }
        return false;
      };

      // Debug information does not impact spill cost.
      if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
          &*PrevInstIt != PrevInst)
        NumCalls++;

      ++PrevInstIt;
    }

    if (NumCalls) {
      SmallVector<Type *, 4> V;
      for (auto *II : LiveValues) {
        auto *ScalarTy = II->getType();
        if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
          ScalarTy = VectorTy->getElementType();
        V.push_back(FixedVectorType::get(ScalarTy, BundleWidth));
      }
      Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
    }

    PrevInst = Inst;
  }

  return Cost;
}
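// The spill-cost model in getSpillCost is deliberately coarse: every call
// surviving between two vectorized instructions is assumed to force the
// currently live tree values out of vector registers, so the tree is charged
// NumCalls * getCostOfKeepingLiveOverCall for vectors of BundleWidth lanes.
// Cheap intrinsics that lower to instructions rather than real calls
// (NoCallIntrinsic above) are exempt.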
  // Walk up both insertelement chains in lock-step until one of them ends or
  // stops changing; whichever chain reaches the other's head first decides
  // the ordering.
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  const InsertElementInst *PrevI1;
  const InsertElementInst *PrevI2;
  do {
    if (I2 == IE1)
      return true;
    if (I1 == IE2)
      return false;
    PrevI1 = I1;
    PrevI2 = I2;
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        getInsertIndex(I1).value_or(Idx2) != Idx2)
      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        getInsertIndex(I2).value_or(Idx1) != Idx1)
      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
namespace {
/// Returns the incoming Value * if the requested type is Value * too, or a
/// default-constructed value otherwise.
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
} // namespace
/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks:
/// 1. If the Base vector is not an undef vector, resize the very first mask
///    to a common VF and perform the action for 2 input vectors (including
///    the non-undef Base); combine the remaining masks with the result.
/// 2. If the Base is undef and there is only 1 shuffle mask, perform the
///    action only for 1 vector, unless the mask is the identity mask.
/// 3. If more than 2 masks are used, perform the remaining shuffle actions
///    for 2 vectors, combining the masks properly between the steps.
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
      else
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    auto *V = ValueSelect::get<T *>(Base);
    (void)V;
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only
    // for the single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      // Identity mask is found.
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least 2 input vectors are shuffled - perform 2
    // vector shuffles step by step, combining the masks in between.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors since they are of the same size,
      // they can be shuffled directly.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = SecMask[I] + Vec1VF;
        }
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem) {
          Mask[I] = (Res1.second ? I : Mask[I]);
          if (SecMask[I] != PoisonMaskElem)
            Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        } else if (SecMask[I] != PoisonMaskElem) {
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        }
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  bool IsBaseNotUndef = !IsBaseUndef.all();
  (void)IsBaseNotUndef;
  // Perform the requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffle other input vectors, if any.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    ArrayRef<int> SecMask = VMIt->second;
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      } else if (Mask[I] != PoisonMaskElem) {
        Mask[I] = I;
      }
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
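// performExtractsShuffleAction is shared between cost modeling and codegen:
// T is a tree entry (for pricing) in the first case and Value (for IR
// emission) in the second, with ResizeAction/Action supplied by the caller.
// The driver in getTreeCost below instantiates it with T = const TreeEntry
// to price the final shuffles that stitch vectorized subvectors into the
// users' insertelement chains.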
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
  InstructionCost Cost = 0;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");

  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();

  SmallPtrSet<Value *, 4> CheckedExtracts;
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    if (TE.State == TreeEntry::NeedToGather) {
      if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
          E && E->getVectorFactor() == TE.getVectorFactor() &&
          E->isSame(TE.Scalars)) {
        // Some gather nodes might be absolutely the same as some vectorizable
        // nodes after reordering; handle it.
        LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle.\n"
                          << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }

    InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
    Cost += C;
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle.\n"
                      << "SLP: Current total cost = " << Cost << "\n");
  }

  SmallPtrSet<Value *, 16> ExtractCostCalculated;
  InstructionCost ExtractCost = 0;
  SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
  SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
  SmallVector<APInt> DemandedElts;
  SmallDenseSet<Value *, 4> UsedInserts;
  DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  for (ExternalUser &EU : ExternalUses) {
    // We only add extract cost once for the same scalar.
    if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;

    // Uses by ephemeral values are free (the ephemeral value will be removed
    // prior to code generation, and so will the extraction).
    if (EphValues.count(EU.User))
      continue;

    // No extract cost for a vector "scalar".
    if (isa<FixedVectorType>(EU.Scalar->getType()))
      continue;

    // If the found user is an insertelement, do not calculate the extract
    // cost but try to detect it as a final shuffled/identity match.
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
        if (!UsedInserts.insert(VU).second)
          continue;
        std::optional<unsigned> InsertIdx = getInsertIndex(VU);
        if (InsertIdx) {
          const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
          auto *It = find_if(
              FirstUsers,
              [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
                return areTwoInsertFromSameBuildVector(
                    VU, cast<InsertElementInst>(Pair.first),
                    [this](InsertElementInst *II) -> Value * {
                      Value *Op0 = II->getOperand(0);
                      if (getTreeEntry(II) && !getTreeEntry(Op0))
                        return nullptr;
                      return Op0;
                    });
              });
          int VecId = -1;
          if (It == FirstUsers.end()) {
            (void)ShuffleMasks.emplace_back();
            SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
            if (Mask.empty())
              Mask.assign(FTy->getNumElements(), PoisonMaskElem);
            // Find the insertvector, vectorized in tree, if any.
            Value *Base = VU;
            while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
              if (IEBase != EU.User &&
                  (!IEBase->hasOneUse() ||
                   getInsertIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
                break;
              // Build the mask for the vectorized insertelement instructions.
              if (const TreeEntry *E = getTreeEntry(IEBase)) {
                VU = IEBase;
                do {
                  IEBase = cast<InsertElementInst>(Base);
                  int Idx = *getInsertIndex(IEBase);
                  assert(Mask[Idx] == PoisonMaskElem &&
                         "InsertElementInstruction used already.");
                  Mask[Idx] = Idx;
                  Base = IEBase->getOperand(0);
                } while (E == getTreeEntry(Base));
                break;
              }
              Base = cast<InsertElementInst>(Base)->getOperand(0);
            }
            FirstUsers.emplace_back(VU, ScalarTE);
            DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
            VecId = FirstUsers.size() - 1;
            auto It = MinBWs.find(ScalarTE);
            if (It != MinBWs.end() &&
                VectorCasts
                    .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                    .second) {
              unsigned BWSz = It->second.first;
              unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
              unsigned VecOpcode;
              if (DstBWSz < BWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
              InstructionCost C = TTI->getCastInstrCost(
                  VecOpcode, FTy,
                  FixedVectorType::get(
                      IntegerType::get(FTy->getContext(), BWSz),
                      FTy->getNumElements()),
                  TTI::CastContextHint::None, CostKind);
              LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                                << " for extending externally used vector with "
                                   "non-equal minimum bitwidth.\n");
              Cost += C;
            }
          } else {
            if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
              It->first = VU;
            VecId = std::distance(FirstUsers.begin(), It);
          }
          int InIdx = *InsertIdx;
          SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
          if (Mask.empty())
            Mask.assign(FTy->getNumElements(), PoisonMaskElem);
          Mask[InIdx] = EU.Lane;
          DemandedElts[VecId].setBit(InIdx);
          continue;
        }
      }
    }
    // Leave the GEPs as is; they are free in most cases and it is better to
    // keep them as GEPs.
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for_each(enumerate(ExternalUses), [&](const auto &P) {
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        });
      }
      // The original GEP can be used if no operands are vectorized or they
      // are marked as externally used already.
      bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
        if (!getTreeEntry(V))
          return true;
        auto It = ValueToExtUses->find(V);
        if (It != ValueToExtUses->end()) {
          // Replace all uses to avoid a compiler crash.
          ExternalUses[It->second].User = nullptr;
          return true;
        }
        return false;
      });
      if (CanBeUsedAsGEP) {
        ExtractCost += TTI->getInstructionCost(GEP, CostKind);
        ExternalUsesAsGEPs.insert(EU.Scalar);
        continue;
      }
    }

    // If we plan to rewrite the tree in a smaller type, we will need to
    // sign-extend the extracted value back to the original type. Account for
    // the extract and the added cost of the extend if needed.
    auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
    auto It = MinBWs.find(getTreeEntry(EU.Scalar));
    if (It != MinBWs.end()) {
      auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
      unsigned Extend =
          It->second.second ? Instruction::SExt : Instruction::ZExt;
      VecTy = FixedVectorType::get(MinTy, BundleWidth);
      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
                                                   VecTy, EU.Lane);
    } else {
      ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                             CostKind, EU.Lane);
    }
  }
  // Add reduced value cost, if resized.
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front().get();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
                                      TTI::CastContextHint::None,
                                      TTI::TCK_RecipThroughput);
      }
    }
  }

  InstructionCost SpillCost = getSpillCost();
  Cost += SpillCost + ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    InstructionCost C = 0;
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    if (VF != VecVF &&
        (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
         !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
      SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
      std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                OrigMask.begin());
      C = TTI->getShuffleCost(TTI::SK_PermuteSingleSrc,
                              FixedVectorType::get(
                                  TE->getMainOp()->getType(), VecVF),
                              OrigMask);
      LLVM_DEBUG(
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement external users.\n";
          TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      Cost += C;
      return std::make_pair(TE, true);
    }
    return std::make_pair(TE, false);
  };
  for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
    Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
    auto Vector = ShuffleMasks[I].takeVector();
    unsigned VF = 0;
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        if (VF == 0)
          VF = TEs.front()->getVectorFactor();
        auto *FTy =
            FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
        if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
            !all_of(enumerate(Mask), [=](const auto &Data) {
              return Data.value() == PoisonMaskElem ||
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          InstructionCost C =
              TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask);
          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                            << " for final shuffle of insertelement "
                               "external users.\n";
                     TEs.front()->dump();
                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
          Cost += C;
        }
      } else {
        if (VF == 0) {
          if (TEs.front() &&
              TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
            VF = TEs.front()->getVectorFactor();
          else
            VF = Mask.size();
        }
        auto *FTy =
            FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
        InstructionCost C =
            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                          << " for final shuffle of vector node and external "
                             "insertelement users.\n";
                   if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
      }
      VF = Mask.size();
      return TEs.back();
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
        /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
    Cost -= InsertCost;
  }
  // Add the cost for the reduced value resize (if required).
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front().get();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize)
        Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      auto *SrcVecTy =
          FixedVectorType::get(Builder.getIntNTy(SrcSize), E.getVectorFactor());
      auto *DstVecTy =
          FixedVectorType::get(Builder.getIntNTy(DstSize), E.getVectorFactor());
      TTI::CastContextHint CCH = TTI::CastContextHint::None;
      switch (E.getOpcode()) {
      case Instruction::SExt:
      case Instruction::ZExt:
      case Instruction::Trunc: {
        const TreeEntry *OpTE = getOperandEntry(&E, 0);
        CCH = getCastContextHint(*OpTE);
        break;
      }
      default:
        break;
      }
      InstructionCost CastCost = TTI->getCastInstrCost(
          Opcode, DstVecTy, SrcVecTy, CCH, TTI::TCK_RecipThroughput);
      Cost += CastCost;
      LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
                        << " for final resize for reduction from " << SrcVecTy
                        << " to " << DstVecTy << "\n";
                 dbgs() << "SLP: Current total cost = " << Cost << "\n");
    }
  }

#ifndef NDEBUG
  SmallString<256> Str;
  {
    raw_svector_ostream OS(Str);
    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
       << "SLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif

  return Cost;
}
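// Summary of the accounting in getTreeCost: the per-node GetCostDiff terms
// (vector minus scalar), plus SpillCost and ExtractCost for values escaping
// the tree, minus InsertCost for insertelement chains the vector result can
// feed directly. A negative total means the vectorized form is expected to
// beat the scalar code.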
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI) {
      if (isa<UndefValue>(VL[I]))
        UndefVectorExtracts.push_back(I);
      continue;
    }
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
      continue;
    std::optional<unsigned> Idx = getExtractIndex(EI);
    // Undefined index.
    if (!Idx) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in
  // extractelements.
  MapVector<unsigned, SmallVector<Value *>> VFToVector;
  for (const auto &Data : VectorOpToIdx)
    VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
        .push_back(Data.first);
  for (auto &Data : VFToVector) {
    stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
      return VectorOpToIdx.find(V1)->second.size() >
             VectorOpToIdx.find(V2)->second.size();
    });
  }
  // Find the best pair of the vectors, or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  Value *SingleVec = nullptr;
  unsigned PairMax = 0;
  std::pair<Value *, Value *> PairVec(nullptr, nullptr);
  for (auto &Data : VFToVector) {
    Value *V1 = Data.second.front();
    if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
      SingleMax = VectorOpToIdx[V1].size() + UndefSz;
      SingleVec = V1;
    }
    Value *V2 = nullptr;
    if (Data.second.size() > 1)
      V2 = *std::next(Data.second.begin());
    if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
                            UndefSz) {
      PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
      PairVec = std::make_pair(V1, V2);
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if it is better to perform a shuffle of 2 vectors or just of a
  // single vector.
  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
  SmallVector<Value *> GatheredExtracts(
      VL.size(), PoisonValue::get(VL.front()->getType()));
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : VectorOpToIdx[SingleVec])
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else {
    for (Value *V : {PairVec.first, PairVec.second})
      for (int Idx : VectorOpToIdx[V])
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as just a
  // shuffle of a single vector or of 2 input vectors.
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask);
  if (!Res) {
    // TODO: try to check other subsets if possible.
    // Restore the original VL if the attempt was not successful.
    copy(SavedVL, VL.begin());
    return std::nullopt;
  }
  // Restore unused scalars from the mask, if some of the extractelements
  // were not selected for the shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
        isa<UndefValue>(GatheredExtracts[I])) {
      std::swap(VL[I], GatheredExtracts[I]);
      continue;
    }
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
        is_contained(UndefVectorExtracts, I))
      continue;
  }
  return Res;
}
/// Tries to find extractelement instructions with constant indices from a
/// fixed vector type and gather them into a bunch, which is highly likely to
/// be detected as a shuffle of 1 or 2 input vectors. On success, the matched
/// scalars are replaced by poison values in \p VL for future analysis.
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  Mask.assign(VL.size(), PoisonMaskElem);
  unsigned SliceSize = VL.size() / NumParts;
  for (unsigned Part = 0; Part < NumParts; ++Part) {
    // Scan the list of gathered scalars for extractelements that can be
    // represented as shuffles.
    MutableArrayRef<Value *> SubVL =
        MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  Entries.clear();
  // TODO: currently checking only for Scalars in the tree entry, need to
  // count reused elements too for better cost estimation.
  const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  const BasicBlock *TEInsertBlock = nullptr;
  // The main node of PHI entries keeps the correct order of operands and
  // incoming blocks.
  if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    TEInsertPt = TEInsertBlock->getTerminator();
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // InsertPt is where vector code for another tree entry (one that shares
    // one or more scalars with TE) is going to be generated. This returns
    // true if the insertion point of vector code for TE dominates that point,
    // making it possible to reuse the vector code there.
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    if (!NodeEUI)
      return false;
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // Check the order of the gather nodes' users.
    if (TEInsertPt->getParent() != InsertBlock &&
        (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
        TEInsertPt->comesBefore(InsertPt))
      return false;
    return true;
  };
  // Find all tree entries used by the gathered values. If no common entries
  // are found - not a shuffle. A set of tree nodes is built for each gathered
  // value, and the intersection of these sets is computed: at least one
  // common tree node for each gathered value means a permutation of a single
  // vector; two different sets mean a permutation of 2 input vectors.
  SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
  DenseMap<Value *, int> UsedValuesEntry;
  for (Value *V : VL) {
    if (isConstant(V))
      continue;
    // Build a list of tree entries where V is used.
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
      if (TEPtr == TE)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndices.size() == 1 &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();

      PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
      const Instruction *InsertPt =
          UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // If 2 gathers are operands of the same entry, compare operand
        // indices and use the earlier one as the base.
        if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
          continue;
        // If the user instruction is used in different vectorized nodes, make
        // it depend on the index.
        if (TEUseEI.UserTE != UseEI.UserTE &&
            TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
          continue;
      }

      // Check if the user node of TE comes after the user node of TEPtr;
      // otherwise TEPtr depends on TE.
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
        continue;
      VToTEs.insert(TEPtr);
    }
    if (const TreeEntry *VTE = getTreeEntry(V)) {
      if (ForOrder) {
        if (VTE->State != TreeEntry::Vectorize) {
          auto It = MultiNodeScalars.find(V);
          if (It == MultiNodeScalars.end())
            continue;
          VTE = *It->getSecond().begin();
          // Iterate through all vectorized nodes.
          auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
            return MTE->State == TreeEntry::Vectorize;
          });
          if (MIt == It->getSecond().end())
            continue;
          VTE = *MIt;
        }
      }
      Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
      if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
        continue;
      VToTEs.insert(VTE);
    }
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // The first iteration, just insert the list of nodes into the vector.
      UsedTEs.push_back(VToTEs);
      UsedValuesEntry.try_emplace(V, 0);
    } else {
      // Check if there are any previously used tree nodes which also use V.
      // If not, consider that we have another input vector.
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      unsigned Idx = 0;
      for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
        // Is there a non-empty intersection of previously listed tree entries
        // and the entries using the current V?
        set_intersect(VToTEs, Set);
        if (!VToTEs.empty()) {
          // Yes, write the new subset and continue with the next scalar.
          Set.swap(VToTEs);
          break;
        }
        VToTEs = SavedVToTEs;
        ++Idx;
      }
      // No non-empty intersection found - need a second set of possible
      // source vectors.
      if (Idx == UsedTEs.size()) {
        // More than 2 input vectors is not a permutation; fall back to a
        // regular gather. TODO: support multiple reshuffled nodes.
        if (UsedTEs.size() == 2)
          continue;
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
      }
      UsedValuesEntry.try_emplace(V, Idx);
    }
  }
  if (UsedTEs.empty()) {
    Entries.clear();
    return std::nullopt;
  }

  unsigned VF = 0;
  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // Try to find the perfect match in another gather node first.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        SmallVector<int> CommonMask = TE->getCommonMask();
        copy(CommonMask, Mask.begin());
      }
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[Part * VL.size() + I] = PoisonMaskElem;
      return TargetTransformInfo::SK_PermuteSingleSrc;
    }
    // No perfect match, just shuffle; choose the first tree node from the
    // tree.
    Entries.push_back(FirstEntries.front());
  } else {
    // Try to find nodes with the same vector factor.
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    // Keep the order of tree nodes to avoid non-determinism.
    DenseMap<int, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      VFToTE.try_emplace(VF, TE);
    }
    // Same, keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        VF = It->first;
        Entries.push_back(It->second);
        Entries.push_back(TE);
        break;
      }
    }
    // No 2 source vectors with the same vector factor - just choose 2 with
    // the maximum index.
    if (Entries.empty()) {
      Entries.push_back(*llvm::max_element(
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          }));
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    }
  }
  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  // Checks if the 2 PHIs are compatible in terms of high possibility to be
  // vectorized.
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // Check that all incoming values are compatible/from the same parent (if
    // they are instructions): constants, or instructions with the
    // same/alternate opcodes from the same basic block.
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In = PHI->getIncomingValue(I);
      Value *In1 = PHI1->getIncomingValue(I);
      if (isConstant(In) && isConstant(In1))
        continue;
      if (!getSameOpcode({In, In1}, *TLI).getOpcode())
        return false;
      if (cast<Instruction>(In)->getParent() !=
          cast<Instruction>(In1)->getParent())
        return false;
    }
    return true;
  };
  // Check if the value can be ignored during analysis of shuffled gathers.
  // It is better to ignore instructions which do not form splats, are not
  // vectorized/not extractelements (those are handled by extractelements
  // processing) or may form a vector node in the future.
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
           !isVectorLikeInstWithConstOps(I) &&
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // Check that a neighboring instruction may form a full vector node with
  // the current instruction V: same/alternate opcode and same parent basic
  // block.
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           getSameOpcode({V, V1}, *TLI).getOpcode() &&
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  };
  // Build a shuffle mask for better cost estimation and vector emission.
  SmallBitVector UsedIdxs(Entries.size());
  SmallVector<std::pair<unsigned, int>> EntryLanes;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // Do not try to shuffle scalars if they are constants, or instructions
    // that can be vectorized as part of the following buildvector.
    if (isConstant(V) || (MightBeIgnored(V) &&
                          ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
                           (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
      continue;
    unsigned Idx = It->second;
    EntryLanes.emplace_back(Idx, I);
    UsedIdxs.set(Idx);
  }
  // Iterate through all shuffled scalars and select entries which can be
  // used for the final shuffle.
  SmallVector<const TreeEntry *> TempEntries;
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // Fix the entry number for the given scalar. These indices are used as
    // vector offsets when calculating the final shuffle mask.
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    TempEntries.push_back(Entries[I]);
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      !VL.equals(ArrayRef(TE->Scalars)
                     .slice(Part * VL.size(),
                            std::min<int>(VL.size(), TE->Scalars.size())))) {
    // Only 1 or 2 entries here; if the number of scalars equals the number of
    // entries the analysis is not very profitable, and since VL is not the
    // same as TE->Scalars there are shuffles before already. Keep this node
    // as the gather node.
    return std::nullopt;
  }
  bool IsIdentity = Entries.size() == 1;
  // Build the final shuffle mask.
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    Mask[Idx] =
        Pair.first * VF +
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  switch (Entries.size()) {
  case 1:
    if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
      return TargetTransformInfo::SK_PermuteSingleSrc;
    break;
  case 2:
    if (EntryLanes.size() > 2 || VL.size() <= 2)
      return TargetTransformInfo::SK_PermuteTwoSrc;
    break;
  default:
    break;
  }
  Entries.clear();
  // Clear the corresponding mask elements.
  std::fill(std::next(Mask.begin(), Part * VL.size()),
            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
  return std::nullopt;
}
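// The returned ShuffleKind tells the caller how the gather can be emitted:
// SK_PermuteSingleSrc when every non-ignored scalar maps into one existing
// tree entry, SK_PermuteTwoSrc when two entries suffice, and std::nullopt
// when the permutation would need more sources than that (or would not pay
// off), in which case a plain gather is kept.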
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
    bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get())
    return {};
  // FIXME: Gathering for non-power-of-2 nodes is not implemented yet.
  if (TE->isNonPowOf2Vec())
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert(TE->UserTreeIndices.size() == 1 &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  unsigned SliceSize = VL.size() / NumParts;
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  for (unsigned Part = 0; Part < NumParts; ++Part) {
    ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(SubRes);
    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
      return Res;
    }
  }
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
                                       bool ForPoisonSrc) const {
  // Find the type of the operands in VL.
  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
  bool DuplicateNonConst = false;
  // Find the cost of inserting/extracting values from the vector. Check if
  // the same elements are inserted several times and count them as shuffle
  // candidates.
  APInt ShuffledElements = APInt::getZero(VL.size());
  DenseMap<Value *, unsigned> UniqueElements;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (!ForPoisonSrc)
      Cost +=
          TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                  I, Constant::getNullValue(VecTy), V);
  };
  SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    // No need to shuffle duplicates for constants.
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
      ShuffledElements.setBit(I);
      ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
      continue;
    }
    auto Res = UniqueElements.try_emplace(V, I);
    if (Res.second) {
      EstimateInsertCost(I, V);
      ShuffleMask[I] = I;
      continue;
    }
    DuplicateNonConst = true;
    ShuffledElements.setBit(I);
    ShuffleMask[I] = Res.first->second;
  }
  if (ForPoisonSrc)
    Cost = TTI->getScalarizationOverhead(VecTy, ~ShuffledElements,
                                         /*Insert*/ true,
                                         /*Extract*/ false, CostKind);
  if (DuplicateNonConst)
    Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                VecTy, ShuffleMask);
  return Cost;
}
static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                           SmallVectorImpl<Value *> &Left,
                                           SmallVectorImpl<Value *> &Right,
                                           const BoUpSLP &R) {
  if (VL.empty())
    return;
  VLOperands Ops(VL, R);
  // Reorder the operands in place.
  Ops.reorder();
  Left = Ops.getVL(0);
  Right = Ops.getVL(1);
}
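// getLastInstructionInBundle() below answers "where may vectorized code for
// this bundle be inserted?". Roughly: it caches per-entry results, walks the
// scalars to find the first/last member instruction (using comesBefore()
// within a block and dominator-tree DFS-in numbers across blocks), and falls
// back to the block's ScheduleData once a region has been scheduled.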
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto &Res = EntryToLastInstruction.FindAndConstruct(E);
  if (Res.second)
    return *Res.second;
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions with
  // constant indices or gathered loads).
  auto *Front = E->getMainOp();
  auto *BB = Front->getParent();
  assert(all_of(E->Scalars, [=](Value *V) -> bool {
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(V))
      return true;
    auto *I = cast<Instruction>(V);
    return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
           isVectorLikeInstWithConstOps(I);
  }));

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(LastInst) &&
               isVectorLikeInstWithConstOps(I))) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      // ... (unreachable-block handling elided)
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    return LastInst;
  };

  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(FirstInst) &&
               isVectorLikeInstWithConstOps(I))) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      // ... (unreachable-block handling elided)
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  // Set the insert point for bundles that do not require scheduling.
  if (doesNotNeedToSchedule(E->Scalars) ||
      (E->State != TreeEntry::NeedToGather &&
       all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
    if ((E->getOpcode() == Instruction::GetElementPtr &&
         any_of(E->Scalars,
                [](Value *V) {
                  return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
                })) ||
        all_of(E->Scalars,
               [](Value *V) {
                 return !isVectorLikeInstWithConstOps(V) &&
                        isUsedOutsideBlock(V);
               }) ||
        (E->State == TreeEntry::NeedToGather && E->Idx == 0 &&
         all_of(E->Scalars, [](Value *V) {
           return isa<ExtractElementInst, UndefValue>(V) ||
                  areAllOperandsNonInsts(V);
         })))
      Res.second = FindLastInst();
    else
      Res.second = FindFirstInst();
    return *Res.second;
  }

  // Find the last instruction. The common case should be that BB has been
  // scheduled, and the last instruction is VL.back(). So we start with
  // VL.back() and iterate over schedule data until we reach the end of the
  // bundle. The end of the bundle is marked by null ScheduleData.
  if (BlocksSchedules.count(BB)) {
    Value *V = E->isOneOf(E->Scalars.back());
    if (doesNotNeedToBeScheduled(V))
      V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
    if (Bundle && Bundle->isPartOfBundle())
      for (; Bundle; Bundle = Bundle->NextInBundle)
        if (Bundle->OpValue == Bundle->Inst)
          Res.second = Bundle->Inst;
  }

  // LastInst can still be null at this point if there is either no entry for
  // BB in BlocksSchedules or no ScheduleData available for VL.back() (for
  // example when buildTree_rec aborted early).
  if (!Res.second)
    Res.second = FindLastInst();
  assert(Res.second && "Failed to find last instruction in bundle");
  return *Res.second;
}
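// setInsertPointAfterBundle() positions the IR builder using the anchor found
// above: after the PHIs for PHI bundles, otherwise just past the last bundle
// member, and stamps the debug location of the bundle's main operation.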
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is a PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (IsPHI || (E->State != TreeEntry::NeedToGather &&
                doesNotNeedToSchedule(E->Scalars))) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle.
    Builder.SetInsertPoint(
        LastInst->getParent(),
        LastInst->getNextNonDebugInstruction()->getIterator());
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
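// gather() below materializes a vector from scalar leaves via an
// insertelement chain. Inserts whose operands live in the current loop or in
// a predecessor chain of the insert block are postponed to the end of the
// chain so that loop-invariant inserts still have a chance to be hoisted.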
Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
  // Instructions/lanes from the current block and/or the current loop are
  // inserted at the end to keep hoisting opportunities for the loop body.
  SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           getTreeEntry(Inst) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(VL[I], I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (cast<VectorType>(Vec->getType())->getElementType() != Ty) {
      assert(Ty->isIntegerTy() &&
             Vec->getType()->getScalarType()->isIntegerTy() &&
             "Expected integer types only.");
      Vec = Builder.CreateIntCast(
          Vec,
          VectorType::get(Ty,
                          cast<VectorType>(Vec->getType())->getElementCount()),
          /*isSigned=*/false);
    }
    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
    auto *InsElt = dyn_cast<InsertElementInst>(Vec);
    if (!InsElt)
      return Vec;
    GatherShuffleExtractSeq.insert(InsElt);
    CSEBlocks.insert(InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(V)) {
      if (TreeEntry *Entry = getTreeEntry(V)) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (auto *SI = dyn_cast<Instruction>(Scalar))
          UserOp = SI;
        else
          UserOp = InsElt;
        if (UserOp) {
          unsigned FoundLane = Entry->findLaneForValue(V);
          ExternalUses.emplace_back(V, UserOp, FoundLane);
        }
      }
    }
    return Vec;
  };

  Value *Val0 =
      isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
  Type *ScalarTy = Val0->getType();
  FixedVectorType *VecTy = FixedVectorType::get(ScalarTy, VL.size());
  Value *Vec = Root ? Root : PoisonValue::get(VecTy);
  SmallVector<int> NonConsts;
  // Insert constant values first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(I))
      continue;
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    if (Root) {
      if (!isa<UndefValue>(VL[I])) {
        NonConsts.push_back(I);
        continue;
      }
      if (isa<PoisonValue>(VL[I]))
        continue;
      if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
        if (SV->getMaskValue(I) == PoisonMaskElem)
          continue;
      }
    }
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append postponed (loop/block-local) inserts last.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
  return Vec;
}
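// ShuffleInstructionBuilder (below) is the IR-emission engine used by
// processBuildVector(): it accumulates at most two input vectors plus a
// combined mask, and lazily emits the minimal shufflevector sequence only
// when finalize() is called.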
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  /// Combined mask for all applied operands and masks.
  SmallVector<int> CommonMask;
  /// List of operands for the shuffle vector emission.
  SmallVector<Value *, 2> InVectors;
  IRBuilderBase &Builder;
  BoUpSLP &R;

  class ShuffleIRBuilder {
    IRBuilderBase &Builder;
    /// Holds all of the instructions that we gathered.
    SetVector<Instruction *> &GatherShuffleExtractSeq;
    /// A list of blocks that we are going to CSE.
    DenseSet<BasicBlock *> &CSEBlocks;
    /// Data layout.
    const DataLayout &DL;

  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates a shufflevector for the 2 operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (V1->getType() != V2->getType()) {
          if (cast<VectorType>(V2->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            V2 = Builder.CreateIntCast(
                V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
          else
            V1 = Builder.CreateIntCast(
                V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
        }
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates a permutation of the single vector operand with the given
    /// mask, if it is not an identity mask.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      if (Mask.empty())
        return V1;
      unsigned VF = Mask.size();
      unsigned LocalVF =
          cast<FixedVectorType>(V1->getType())->getNumElements();
      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
        return V1;
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(FixedVectorType::get(Ty, VF));
    }
    /// Resizes the 2 input vectors to match, if their sizes differ: the
    /// smaller vector is widened to the size of the larger one.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
  };

  /// Smart shuffle instruction emission: walks through the shuffle trees and
  /// tries to find the best matching vector for the actual shufflevector.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder);
  }

  /// Transforms CommonMask per the given Mask after a shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx;
  }

public:
  ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
      : Builder(Builder), R(R) {}
  /// Adjusts extractelements after reusing them.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      int Idx = Mask[I];
      if (Idx == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
      VecBase = EI->getVectorOperand();
      if (const TreeEntry *TE = R.getTreeEntry(VecBase))
        VecBase = TE->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If the only use is vectorized, the extractelement itself can be
      // erased.
      if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            const TreeEntry *UTE = R.getTreeEntry(U);
            return !UTE || R.MultiNodeScalars.contains(U) ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return any_of(TE->UserTreeIndices,
                                            [&](const EdgeInfo &Edge) {
                                              return Edge.UserTE == UTE;
                                            }) &&
                                     is_contained(TE->Scalars, EI);
                            }) != 1;
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1)
      return VecBase;
    UseVecBaseAsInput = true;
    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
      for (auto [I, Idx] : enumerate(Mask))
        if (Idx != PoisonMaskElem)
          Idx = I;
    };
    // Perform a multi-register shuffle of the vectors with the same size.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = E->Scalars.size() / NumParts;
    for (unsigned Part = 0; Part < NumParts; ++Part) {
      ArrayRef<Value *> VL =
          ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
      MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      int PrevSize = 0;
      for (const auto [I, V] : enumerate(VL)) {
        if (SubMask[I] == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (const TreeEntry *TE = R.getTreeEntry(VecOp))
          VecOp = TE->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        const int Size =
            cast<FixedVectorType>(VecOp->getType())->getNumElements();
        assert((PrevSize == Size || PrevSize == 0) &&
               "Expected vectors of the same size.");
        PrevSize = Size;
        Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert((Part == 0 || all_of(seq<unsigned>(0, Part),
                                    [&](unsigned P) {
                                      ArrayRef<int> Slice =
                                          Mask.slice(P * SliceSize, SliceSize);
                                      return all_of(Slice, [](int Idx) {
                                        return Idx == PoisonMaskElem;
                                      });
                                    })) &&
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements();
        if (Vec->getType() != SubVec->getType()) {
          unsigned SubVecVF =
              cast<FixedVectorType>(SubVec->getType())->getNumElements();
          VF = std::max(VF, SubVecVF);
        }
        // Adjust SubMask.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += VF;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    copy(VecMask, Mask.begin());
    return Vec;
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all deps are ready.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission; a stub value of the right vector type is
    // returned and replaced after the whole tree has been emitted.
    auto *ResVecTy = FixedVectorType::get(E->Scalars.front()->getType(),
                                          E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        ResVecTy,
        PoisonValue::get(PointerType::getUnqual(ResVecTy->getContext())),
        MaybeAlign());
  }
  /// Adds 2 input vectors (in the form of tree entries) and the mask for
  /// their shuffling.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    add(E1.VectorizedValue, E2.VectorizedValue, Mask);
  }
  /// Adds a single input vector (in the form of a tree entry) and the mask
  /// for its shuffling.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    add(E1.VectorizedValue, Mask);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + Sz;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds another input vector and the mask for its shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool /*ForExtracts*/ = false) {
    if (InVectors.empty()) {
      if (!isa<FixedVectorType>(V1->getType())) {
        V1 = createShuffle(V1, nullptr, CommonMask);
        CommonMask.assign(Mask.size(), PoisonMaskElem);
        transformMaskAfterShuffle(CommonMask, Mask);
      }
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType() ||
          !isa<FixedVectorType>(V1->getType())) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] =
                V->getType() != V1->getType()
                    ? Idx + Sz
                    : Mask[Idx] + cast<FixedVectorType>(V1->getType())
                                      ->getNumElements();
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // Check if the second vector is required, i.e. whether the used
      // elements are not all covered by the first one already.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
          InVectors.push_back(V1);
          break;
        }
    }
    int VF = CommonMask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root);
  }
  Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
  /// Finalizes emission of the shuffles.
  Value *
  finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask);
      InVectors.front() = Vec;
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                 bool PostponedPHIs) {
  ValueList &VL = E->getOperand(NodeIdx);
  const unsigned VF = VL.size();
  InstructionsState S = getSameOpcode(VL, *TLI);
  // Special processing for GEPs bundle, which may include non-gep values.
  if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    if (It != VL.end())
      S = getSameOpcode(*It, *TLI);
  }
  if (S.getOpcode()) {
    auto CheckSameVE = [&](const TreeEntry *VE) {
      return VE->isSame(VL) &&
             (any_of(VE->UserTreeIndices,
                     [E, NodeIdx](const EdgeInfo &EI) {
                       return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
                     }) ||
              any_of(VectorizableTree,
                     [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
                       return TE->isOperandGatherNode({E, NodeIdx}) &&
                              VE->isSame(TE->Scalars);
                     }));
    };
    TreeEntry *VE = getTreeEntry(S.OpValue);
    bool IsSameVE = VE && CheckSameVE(VE);
    if (!IsSameVE) {
      auto It = MultiNodeScalars.find(S.OpValue);
      if (It != MultiNodeScalars.end()) {
        auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
          return TE != VE && CheckSameVE(TE);
        });
        if (I != It->getSecond().end()) {
          VE = *I;
          IsSameVE = true;
        }
      }
    }
    if (IsSameVE) {
      auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
        ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
        ShuffleBuilder.add(V, Mask);
        return ShuffleBuilder.finalize(std::nullopt);
      };
      Value *V = vectorizeTree(VE, PostponedPHIs);
      if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
        if (!VE->ReuseShuffleIndices.empty()) {
          // Reshuffle to get only unique values.
          SmallVector<int> Mask(VF, PoisonMaskElem);
          for (auto [I, V] : enumerate(VL)) {
            if (isa<PoisonValue>(V))
              continue;
            Mask[I] = VE->findLaneForValue(V);
          }
          V = FinalShuffle(V, Mask);
        } else {
          assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
                 "Expected vectorization factor less "
                 "than original vector size.");
          SmallVector<int> UniformMask(VF, 0);
          std::iota(UniformMask.begin(), UniformMask.end(), 0);
          V = FinalShuffle(V, UniformMask);
        }
      }
      // Need to update the operand gather node if the operand is actually not
      // a vectorized node, but a buildvector/gather node matching one of the
      // vectorized nodes.
      if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
            return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
          }) == VE->UserTreeIndices.end()) {
        auto *It = find_if(
            VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
              return TE->State == TreeEntry::NeedToGather &&
                     TE->UserTreeIndices.front().UserTE == E &&
                     TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
            });
        assert(It != VectorizableTree.end() && "Expected gather node operand.");
        (*It)->VectorizedValue = V;
      }
      return V;
    }
  }

  // Find the corresponding gather entry and vectorize it. This keeps the
  // tree/graph transformations checkable in many cases.
  auto *I = find_if(VectorizableTree,
                    [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
                      return TE->isOperandGatherNode({E, NodeIdx});
                    });
  assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
  assert(I->get()->UserTreeIndices.size() == 1 &&
         "Expected only single user for the gather node.");
  assert(I->get()->isSame(VL) && "Expected same list of scalars.");
  return vectorizeTree(I->get(), PostponedPHIs);
}
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
  assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
                                       E->ReuseShuffleIndices.end());
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Build a mask out of the reorder indices and reorder scalars per this mask.
  SmallVector<int> ReorderMask;
  inversePermutation(E->ReorderIndices, ReorderMask);
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
    unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    auto *It =
        find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
          return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
                   return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
                 }) != TE->UserTreeIndices.end();
        });
    if (It == VectorizableTree.end())
      return false;
    int Idx;
    if ((Mask.size() < InputVF &&
         ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
         Idx == 0) ||
        (Mask.size() == InputVF &&
         ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
      std::iota(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), (I + 1) * SliceSize), 0);
    } else {
      unsigned IVal =
          *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
      std::fill(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
    }
    return true;
  };
  BVTy ShuffleBuilder(Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *ScalarTy = GatheredScalars.front()->getType();
  auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size());
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= GatheredScalars.size())
    NumParts = 1;
  if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
    // Check for gathered extracts.
    bool Resized = false;
    ExtractShuffles =
        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(ExtractMask)) {
        if (I == PoisonMaskElem)
          continue;
        if (const auto *TE = getTreeEntry(
                cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
          ExtractEntries.push_back(TE);
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        // Postpone gather emission; it will be emitted after the end of the
        // process to keep the correct order.
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            Resized = true;
            GatheredScalars.append(VF - GatheredScalars.size(),
                                   PoisonValue::get(ScalarTy));
          }
      }
    }
    // Check gather-shuffles only after looking for full matched gathers.
    if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
        E->isAltShuffle() ||
        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
        isSplat(E->Scalars) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffles =
          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
    }
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
          Entries.front().front()->isSame(E->Scalars)) {
        // A perfectly matched entry was found: just reuse the previously
        // vectorized node.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          << shortBundleName(E->Scalars) << ".\n");
        // Restore the mask for previously partially matched values.
        Mask.resize(E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
        } else {
          for (auto [I, V] : enumerate(E->Scalars)) {
            if (isa<PoisonValue>(V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        ShuffleBuilder.add(*FrontTE, Mask);
        Res = ShuffleBuilder.finalize(E->getCommonMask());
        return Res;
      }
      if (!Resized) {
        if (GatheredScalars.size() != VF &&
            any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
              return any_of(TEs, [&](const TreeEntry *TE) {
                return TE->getVectorFactor() == VF;
              });
            }))
          GatheredScalars.append(VF - GatheredScalars.size(),
                                 PoisonValue::get(ScalarTy));
      }
      // Remove shuffled elements from the list of gathers.
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        if (Mask[I] != PoisonMaskElem)
          GatheredScalars[I] = PoisonValue::get(ScalarTy);
      }
    }
  }
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
    Scalars.append(VF - Scalars.size(), PoisonValue::get(ScalarTy));
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-constant values and all constant values; repeated
    // values are just shuffled.
    int NumNonConsts = 0;
    int SinglePos = 0;
    for (auto [I, V] : enumerate(Scalars)) {
      if (isa<UndefValue>(V)) {
        if (!isa<PoisonValue>(V)) {
          ReuseMask[I] = I;
          UndefPos.push_back(I);
        }
        continue;
      }
      if (isConstant(V)) {
        ReuseMask[I] = I;
        continue;
      }
      ++NumNonConsts;
      SinglePos = I;
      Value *OrigV = V;
      Scalars[I] = PoisonValue::get(ScalarTy);
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore single insert element.
      if (IsSplat) {
        ReuseMask.assign(VF, PoisonMaskElem);
        std::swap(Scalars.front(), Scalars[SinglePos]);
        if (!UndefPos.empty() && UndefPos.front() == 0)
          Scalars.front() = UndefValue::get(ScalarTy);
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with a simple broadcast: this
      // works if the broadcast value is known non-poisonous, used in the same
      // user node already, or can be frozen first.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
                (E->UserTreeIndices.size() == 1 &&
                 any_of(V->uses(), [E](const Use &U) {
                   // Check if the value is already used in the same
                   // operation in one of the nodes.
                   return E->UserTreeIndices.front().EdgeIdx !=
                              U.getOperandNo() &&
                          is_contained(
                              E->UserTreeIndices.front().UserTE->Scalars,
                              U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalar and emit a broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // Set the undef position to the non-poisoned scalar.
          ReuseMask[I] = Pos;
          if (I != Pos)
            Scalars[I] = PoisonValue::get(ScalarTy);
        }
      } else {
        // Replace undefs by poisons, emit a broadcast and then a freeze.
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          if (isa<UndefValue>(Scalars[I]))
            Scalars[I] = PoisonValue::get(ScalarTy);
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // A gather of extractelements can be represented as a shuffle of one or
      // two source vectors. Find those input vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
          ExtractMask[I] = PoisonMaskElem;
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          if (isa<UndefValue>(E->Scalars[I]))
            continue;
          auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
          Value *VecOp = EI->getVectorOperand();
          if (const auto *TE = getTreeEntry(VecOp))
            if (TE->VectorizedValue)
              VecOp = TE->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        IsNonPoisoned &=
            isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size());
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
      } else {
        IsUsedInExpr = false;
        ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
                               ScalarTy, GatheredScalars.size())),
                           ExtractMask, /*ForExtracts=*/true);
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize = E->Scalars.size() / NumParts;
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
        VecMask.assign(VecMask.size(), PoisonMaskElem);
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          IsUsedInExpr &= FindReusedSplat(
              VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          if (TEs.front()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
                isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
        }
      }
    }
    // Figure out the best way to combine the remaining values: build a
    // constant vector and shuffle with it, or insert elements one by one.
    SmallVector<Value *> NonConstants(GatheredScalars);
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if we currently
    // have a single permutation and more than one scalar constant.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(V) && !isa<PoisonValue>(V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(V) && !isa<UndefValue>(V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(V) && !isa<PoisonValue>(V);
         }) > 1);
    // NonConstants keeps only the non-constant values, GatheredScalars only
    // the constants for the final shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(ScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(ScalarTy);
    }
    // Generate constants for the final shuffle and build a mask for them.
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
            TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
            Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
          });
  } else if (!allConstant(GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(E->Scalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(E->Scalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
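// vectorizeTree(TreeEntry *, bool) below is the per-node code generator:
// gather nodes go through createBuildVector(), everything else lands in a
// large per-opcode switch, with FinalShuffle() applying the entry's reorder
// and reuse masks to each produced vector value.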
Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Builder,
                                                                *this);
}

Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
  IRBuilderBase::InsertPointGuard Guard(Builder);

  if (E->VectorizedValue &&
      (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
       E->isAltShuffle())) {
    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }

  if (E->State == TreeEntry::NeedToGather) {
    // Set the insert point for non-reduction initial nodes.
    if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E);
    E->VectorizedValue = Vec;
    return Vec;
  }
  bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
    ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
      ArrayRef<int> Mask(
          reinterpret_cast<const int *>(E->ReorderIndices.begin()),
          E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
      ShuffleBuilder.addOrdered(V, std::nullopt);
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
  };

  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize) &&
         "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  Type *ScalarTy = VL0->getType();
  if (auto *Store = dyn_cast<StoreInst>(VL0))
    ScalarTy = Store->getValueOperand()->getType();
  else if (auto *IE = dyn_cast<InsertElementInst>(VL0))
    ScalarTy = IE->getOperand(1)->getType();
  auto It = MinBWs.find(E);
  if (It != MinBWs.end())
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
  auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() ||
            !E->UserTreeIndices.empty()) &&
           "PHI reordering is free.");
    if (PostponedPHIs && E->VectorizedValue)
      return E->VectorizedValue;
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    Builder.SetCurrentDebugLocation(PH->getDebugLoc());
    if (PostponedPHIs || !E->VectorizedValue) {
      PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
      E->PHI = NewPhi;
      Value *V = NewPhi;

      // Adjust the insertion point once all PHIs have been generated.
      Builder.SetInsertPoint(PH->getParent(),
                             PH->getParent()->getFirstInsertionPt());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());

      V = FinalShuffle(V, E, VecTy);

      E->VectorizedValue = V;
      if (PostponedPHIs)
        return V;
    }
    PHINode *NewPhi = cast<PHINode>(E->PHI);
    // If the PHI node is fully emitted, exit.
    if (NewPhi->getNumIncomingValues() != 0)
      return NewPhi;

    // PHI nodes may have multiple entries from the same block; visit every
    // block once.
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;

    for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
      BasicBlock *IBB = PH->getIncomingBlock(I);

      // Stop emission if all incoming values are generated.
      if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return NewPhi;
      }

      if (!VisitedBBs.insert(IBB).second) {
        NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
        continue;
      }

      Builder.SetInsertPoint(IBB->getTerminator());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
      Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, I)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      NewPhi->addIncoming(Vec, IBB);
    }

    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
           "Invalid number of incoming values");
    return NewPhi;
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    if (const TreeEntry *TE = getTreeEntry(V))
      V = TE->VectorizedValue;
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E, VecTy);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    Value *NewV = propagateMetadata(V, E->Scalars);
    NewV = FinalShuffle(NewV, E, VecTy);
    E->VectorizedValue = NewV;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
    Value *V = vectorizeOperand(E, 1, PostponedPHIs);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
      assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
          V,
          FixedVectorType::get(
              ScalarTy,
              cast<FixedVectorType>(V->getType())->getNumElements()),
          Res.second);
    }

    // Create an InsertVector shuffle if necessary.
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();

    unsigned Offset = *getInsertIndex(VL0);
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create a shuffle to resize the vector.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(NumElts - NumScalars, PoisonMaskElem);
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      Value *Scalar = E->Scalars[PrevMask[I]];
      unsigned InsertIdx = *getInsertIndex(Scalar);
      IsIdentity &= InsertIdx - Offset == I;
      Mask[InsertIdx - Offset] = I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // Follow all insertelement instructions from the current buildvector
        // sequence.
        InsertElementInst *Ins = cast<InsertElementInst>(VL0);
        do {
          std::optional<unsigned> InsertIdx = getInsertIndex(Ins);
          if (!InsertIdx)
            break;
          if (InsertMask[*InsertIdx] == PoisonMaskElem)
            InsertMask[*InsertIdx] = *InsertIdx;
          if (!Ins->hasOneUse())
            break;
          Ins = dyn_cast_or_null<InsertElementInst>(
              Ins->getUniqueUndroppableUser());
        } while (Ins);
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        SmallBitVector IsFirstUndef =
            isUndefVector(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          unsigned Idx = 0;
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              if (!V2)
                V2 = UndefValue::get(V->getType());
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              ++Idx;
            } else if (InsertMask[I] != PoisonMaskElem &&
                       Mask[I] == PoisonMaskElem) {
              InsertMask[I] = PoisonMaskElem;
            }
          }
        } else {
          InsertMask = Mask;
        }
      }
      if (V2) {
        V = Builder.CreateShuffleVector(V, V2, InsertMask);
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
      // Rebuild the mask that combines the new vector with the original
      // operand of the first insertelement; unset lanes are taken from it.
      for (unsigned I = 0; I < NumElts; I++) {
        if (Mask[I] != PoisonMaskElem)
          InsertMask[I] = I;
      }
      SmallBitVector UseMask =
          buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
      SmallBitVector IsFirstUndef =
          isUndefVector(FirstInsert->getOperand(0), UseMask);
      if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
          NumElts != NumScalars) {
        if (IsFirstUndef.all()) {
          // Turn the mask into an InsertMask applied to V.
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
              if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
                InsertMask[I] = I + NumElts;
            }
          }
          V = Builder.CreateShuffleVector(
              V,
              IsFirstPoison.all() ? PoisonValue::get(V->getType())
                                  : FirstInsert->getOperand(0),
              InsertMask, cast<Instruction>(E->Scalars.back())->getName());
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        } else {
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem)
              InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
            else
              InsertMask[I] += NumElts;
          }
          V = Builder.CreateShuffleVector(
              FirstInsert->getOperand(0), V, InsertMask,
              cast<Instruction>(E->Scalars.back())->getName());
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      }
    }
    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    auto *CI = cast<CastInst>(VL0);
    Instruction::CastOps VecOpcode = CI->getOpcode();
    Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         SrcScalarTy != CI->getOperand(0)->getType())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *R = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (L->getType() != R->getType()) {
      assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
              getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }

    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    Value *V = Builder.CreateCmp(P0, L, R);
    propagateIRFlags(V, E->Scalars, VL0);
    // Do not cast for cmps.
    VecTy = cast<FixedVectorType>(V->getType());
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *True = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *False = vectorizeOperand(E, 2, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((It != MinBWs.end() ||
              getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
              getOperandEntry(E, 2)->State == TreeEntry::NeedToGather ||
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }

    Value *V = Builder.CreateSelect(Cond, True, False);
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);

    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    Value *V = Builder.CreateUnOp(
        static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *I = dyn_cast<Instruction>(V))
      V = propagateMetadata(I, E->Scalars);

    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
      assert((It != MinBWs.end() ||
              getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
              getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (LHS->getType() != VecTy)
        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      if (RHS->getType() != VecTy)
        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }

    Value *V = Builder.CreateBinOp(
        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
    propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = propagateMetadata(I, E->Scalars);
      // Drop nuw flags for abs(sub(commutative), true).
      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          any_of(E->Scalars, [](Value *V) {
            return isCommutative(cast<Instruction>(V));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    }

    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::StridedVectorize) {
      Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
      Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
      PO = IsReverseOrder ? PtrN : Ptr0;
      std::optional<int> Diff = getPointersDiff(
          VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
      Type *StrideTy = DL->getIndexType(PO->getType());
      Value *StrideVal;
      if (Diff) {
        int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
        StrideVal =
            ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                           DL->getTypeAllocSize(ScalarTy));
      } else {
        SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
        transform(E->Scalars, PointerOps.begin(), [](Value *V) {
          return cast<LoadInst>(V)->getPointerOperand();
        });
        std::optional<Value *> Stride =
            calculateRtStride(PointerOps, ScalarTy, *DL, *SE,
                              &*Builder.GetInsertPoint());
        Value *NewStride =
            Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
        StrideVal = Builder.CreateMul(
            NewStride,
            ConstantInt::get(
                StrideTy,
                (IsReverseOrder ? -1 : 1) *
                    static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
      }
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {VecTy, PO->getType(), StrideTy},
          {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      // ... (alignment attribute on the pointer argument elided)
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      // Use the minimum alignment of the gathered loads.
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = propagateMetadata(NewLI, E->Scalars);

    V = FinalShuffle(V, E, VecTy);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);

    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E, VecTy);

    Value *Ptr = SI->getPointerOperand();
    Instruction *ST =
        Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());

    Value *V = propagateMetadata(ST, E->Scalars);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    SmallVector<Type *> ArgTys =
        buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
                               It != MinBWs.end() ? It->second.first : 0);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    SmallVector<Type *, 2> TysForDecl;
    // Add the return type if the intrinsic is overloaded on it.
    if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
      TysForDecl.push_back(VecTy);
    auto *CEI = cast<CallInst>(VL0);
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      // Some intrinsics have scalar arguments. These arguments should not be
      // vectorized.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
        ScalarArg = CEI->getArgOperand(I);
        // If abs is demoted to a smaller bitwidth, its second argument must
        // be false (do not return poison for signed-min input).
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        OpVecs.push_back(ScalarArg);
        if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
          TysForDecl.push_back(ScalarArg->getType());
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      ScalarArg = CEI->getArgOperand(I);
      if (cast<VectorType>(OpVec->getType())->getElementType() !=
              ScalarArg->getType()->getScalarType() &&
          It == MinBWs.end()) {
        auto *CastTy = FixedVectorType::get(ScalarArg->getType(),
                                            VecTy->getNumElements());
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      OpVecs.push_back(OpVec);
      if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
        TysForDecl.push_back(OpVec->getType());
    }

    Function *CF;
    if (!UseIntrinsic) {
      // ... (vector-function (VFDatabase) lookup elided)
    } else {
      CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

    propagateIRFlags(V, E->Scalars, VL0);
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    assert(E->isAltShuffle() &&
           ((Instruction::isBinaryOp(E->getOpcode()) &&
             Instruction::isBinaryOp(E->getAltOpcode())) ||
            (Instruction::isCast(E->getOpcode()) &&
             Instruction::isCast(E->getAltOpcode())) ||
            (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
           "Invalid Shuffle Vector Operand");

    Value *LHS = nullptr, *RHS = nullptr;
    if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
      setInsertPointAfterBundle(E);
      LHS = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      RHS = vectorizeOperand(E, 1, PostponedPHIs);
    } else {
      setInsertPointAfterBundle(E);
      LHS = vectorizeOperand(E, 0, PostponedPHIs);
    }
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (LHS && RHS &&
        ((Instruction::isBinaryOp(E->getOpcode()) &&
          (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
         (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
      assert((It != MinBWs.end() ||
              getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
              getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      Type *CastTy = VecTy;
      if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
        if (cast<VectorType>(LHS->getType())
                ->getElementType()
                ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                             ->getElementType()
                                             ->getIntegerBitWidth())
          CastTy = RHS->getType();
        else
          CastTy = LHS->getType();
      }
      if (LHS->getType() != CastTy)
        LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
      if (RHS->getType() != CastTy)
        RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
    }

    Value *V0, *V1;
    if (Instruction::isBinaryOp(E->getOpcode())) {
      V0 = Builder.CreateBinOp(
          static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
      V1 = Builder.CreateBinOp(
          static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
    } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
      V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
      auto *AltCI = cast<CmpInst>(E->getAltOp());
      CmpInst::Predicate AltPred = AltCI->getPredicate();
      V1 = Builder.CreateCmp(AltPred, LHS, RHS);
    } else {
      if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
        unsigned SrcBWSz = DL->getTypeSizeInBits(
            cast<VectorType>(LHS->getType())->getElementType());
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        if (BWSz <= SrcBWSz) {
          if (BWSz < SrcBWSz)
            LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
          assert(LHS->getType() == VecTy && "Expected same type as operand.");
          if (auto *I = dyn_cast<Instruction>(LHS))
            LHS = propagateMetadata(I, E->Scalars);
          E->VectorizedValue = LHS;
          ++NumVectorInstructions;
          return LHS;
        }
      }
      V0 = Builder.CreateCast(
          static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
      V1 = Builder.CreateCast(
          static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
    }
    // Add V0 and V1 to later analysis to try to find and remove a matching
    // instruction, if any.
    for (Value *V : {V0, V1}) {
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    // Create a shuffle to take the alternate operations from the vector.
    // Also, gather up the main and alt scalar ops to propagate IR flags to
    // each vector operation.
    ValueList OpScalars, AltScalars;
    SmallVector<int> Mask;
    E->buildAltOpShuffleMask(
        [E, this](Instruction *I) {
          assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
          return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                        *TLI);
        },
        Mask, &OpScalars, &AltScalars);

    propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
    propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
    auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
      // Drop nuw flags for abs(sub(commutative), true).
      if (auto *I = dyn_cast<Instruction>(Vec);
          I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
          any_of(E->Scalars, [](Value *V) {
            auto *IV = cast<Instruction>(V);
            return IV->getOpcode() == Instruction::Sub &&
                   isCommutative(cast<Instruction>(IV));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    };
    DropNuwFlag(V0, E->getOpcode());
    DropNuwFlag(V1, E->getAltOpcode());

    Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = propagateMetadata(I, E->Scalars);
      GatherShuffleExtractSeq.insert(I);
      CSEBlocks.insert(I->getParent());
    }

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  default:
    llvm_unreachable("unknown inst");
  }
  return nullptr;
}

Value *BoUpSLP::vectorizeTree() {
  ExtraValueToDebugLocsMap ExternallyUsedValues;
  SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
  return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
}
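// The remainder of vectorizeTree() stitches the vector code into the
// function: it schedules all blocks, re-emits postponed gathers over their
// stub values, extracts externally used lanes (or rewrites insertelement
// chains into shuffles), runs the gather-sequence CSE, and finally erases
// the now-dead scalar instructions.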
namespace {
/// Data for handling buildvector sequences whose scalars are reused from
/// other tree entries.
struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  MapVector<Value *, SmallVector<int>> ValueMasks;
};
} // namespace

Value *BoUpSLP::vectorizeTree(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues,
    SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
    Instruction *ReductionRoot) {
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }
  // Clean the Entry-to-LastInstruction table; it can be affected after
  // scheduling and needs to be rebuilt.
  EntryToLastInstruction.clear();

  // ... (initial insert-point setup elided)

  // Emit vectorized PHIs first; their operands are postponed to avoid cyclic
  // dependencies.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
    if (TE->State == TreeEntry::Vectorize &&
        TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
        TE->VectorizedValue)
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);

  // Run through the list of postponed gathers and emit them, replacing the
  // temporary stub values with actual vector instructions.
  ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
  DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
      if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
              TE->UserTreeIndices.front().EdgeIdx)))
        // Found a gather node which is absolutely the same as one of the
        // vectorized nodes; it may happen after reordering.
        continue;
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI =
        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, insert before all of the stub's users in
    // the same block, since the stub's original position may have become
    // invalid while dependencies were being resolved.
    Instruction *InsertPt = PrevVec;
    if (isa<PHINode>(UserI)) {
      InsertPt = PrevVec->getParent()->getTerminator();
      for (User *U : PrevVec->users()) {
        if (U == UserI)
          continue;
        auto *UI = dyn_cast<Instruction>(U);
        if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
          continue;
        if (UI->comesBefore(InsertPt))
          InsertPt = UI;
      }
    }
    Builder.SetInsertPoint(InsertPt);
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
    Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        if (const TreeEntry *BaseTE = getTreeEntry(V)) {
          auto It = MinBWs.find(BaseTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            if (*IsSigned)
              break;
          }
          for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
            auto It = MinBWs.find(MNTE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          // Scan through the buildvector/gather nodes as well.
          for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
            auto It = MinBWs.find(BVE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
            IsSigned =
                IsSigned.value_or(false) ||
                !isKnownNonNegative(EE->getVectorOperand(),
                                    SimplifyQuery(*DL));
            continue;
          }
          if (IsSigned.value_or(false))
            break;
        }
      }
      if (IsSigned.value_or(false)) {
        // Final attempt: check the user node.
        auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(IsSigned &&
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node if it was already used for one of the
    // buildvector nodes.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    eraseInstruction(PrevVec);
  }
  SmallVector<ShuffledInsertData> ShuffledInserts;
  // Maps a vector to the corresponding insertelement instruction.
  DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
  // Maps an extracted scalar to the emitted extractelement (and optional
  // cast) per basic block; only one extract per block should be emitted.
  DenseMap<Value *,
           DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
      ScalarToEEs;
  SmallDenseSet<Value *, 4> UsedInserts;
  DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
  SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
  // Extract all of the elements with external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    llvm::User *User = ExternalUse.User;

    // Skip users that we already RAUW. This happens when one instruction
    // has multiple uses of the same value.
    if (User && !is_contained(Scalar->users(), User))
      continue;
    TreeEntry *E = getTreeEntry(Scalar);
    assert(E && "Invalid scalar");
    assert(E->State != TreeEntry::NeedToGather &&
           "Extracting from a gather list");
    // Non-instruction pointers are not deleted, just skip them.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
        bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts, just move the only one in the
          // current block.
          auto EEIt = It->second.find(Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Instruction *I = EEIt->second.first;
            if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
              if (auto *CI = EEIt->second.second)
                CI->moveAfter(I);
            }
            Ex = I;
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // "Reuse" the existing extract to improve final codegen.
          if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
            Value *V = ES->getVectorOperand();
            if (const TreeEntry *ETE = getTreeEntry(V))
              V = ETE->VectorizedValue;
            Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
          } else if (ReplaceGEP) {
            // Leave the GEPs as is, they are free in most cases.
            auto *CloneGEP = GEP->clone();
            if (isa<Instruction>(Vec))
              CloneGEP->insertBefore(cast<Instruction>(Vec));
            else
              CloneGEP->insertBefore(GEP);
            if (GEP->hasName())
              CloneGEP->takeName(GEP);
            Ex = CloneGEP;
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          // If necessary, sign-/zero-extend the extracted scalar to the
          // larger type.
          ExV = Ex;
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
                                        MinBWs.find(E)->second.second);
          if (auto *I = dyn_cast<Instruction>(Ex))
            ScalarToEEs[Scalar].try_emplace(
                Builder.GetInsertBlock(),
                std::make_pair(I, cast<Instruction>(ExV)));
        }
        // The branch above may produce constants, since operand 0 might be a
        // constant.
        if (auto *ExI = dyn_cast<Instruction>(Ex)) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        return ExV;
      }
      assert(isa<FixedVectorType>(Scalar->getType()) &&
             isa<InsertElementInst>(Scalar) &&
             "In-tree scalar of vector type is not insertelement?");
      auto *IE = cast<InsertElementInst>(Scalar);
      VectorToInsertElement.try_emplace(Vec, IE);
      return Vec;
    };
    // If User == nullptr, the Scalar remains a scalar in vectorized
    // instructions or is used as an extra argument. Generate an
    // ExtractElement instruction and update the record in
    // ExternallyUsedValues.
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert((ExternallyUsedValues.count(Scalar) ||
              any_of(Scalar->users(),
                     [&](llvm::User *U) {
                       if (ExternalUsesAsGEPs.contains(U))
                         return true;
                       TreeEntry *UseEntry = getTreeEntry(U);
                       return UseEntry &&
                              (UseEntry->State == TreeEntry::Vectorize ||
                               UseEntry->State ==
                                   TreeEntry::StridedVectorize) &&
                              (E->State == TreeEntry::Vectorize ||
                               E->State == TreeEntry::StridedVectorize) &&
                              doesInTreeUserNeedToExtract(
                                  Scalar,
                                  cast<Instruction>(UseEntry->Scalars.front()),
                                  TLI);
                     })) &&
             "Scalar with nullptr User must be registered in "
             "ExternallyUsedValues map or remain as scalar in vectorized "
             "instructions");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI))
          Builder.SetInsertPoint(PHI->getParent(),
                                 PHI->getParent()->getFirstNonPHIIt());
        else
          Builder.SetInsertPoint(VecI->getParent(),
                                 std::next(VecI->getIterator()));
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock(),
                               F->getEntryBlock().getFirstInsertionPt());
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // Required to update internally referenced instructions.
      Scalar->replaceAllUsesWith(NewInst);
      ReplacedExternals.emplace_back(Scalar, NewInst);
      continue;
    }

    if (auto *VU = dyn_cast<InsertElementInst>(User)) {
      // Skip if the scalar is another vector op or Vec is not an instruction.
      if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
        if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
          if (!UsedInserts.insert(VU).second)
            continue;
          // Need to use the original vector if the root is truncated.
          auto BWIt = MinBWs.find(E);
          if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
            auto *ScalarTy = FTy->getElementType();
            auto Key = std::make_pair(Vec, ScalarTy);
            auto VecIt = VectorCasts.find(Key);
            if (VecIt == VectorCasts.end()) {
              IRBuilderBase::InsertPointGuard Guard(Builder);
              if (auto *IVec = dyn_cast<Instruction>(Vec))
                Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
              Vec = Builder.CreateIntCast(
                  Vec,
                  FixedVectorType::get(
                      ScalarTy,
                      cast<FixedVectorType>(Vec->getType())->getNumElements()),
                  BWIt->second.second);
              VectorCasts.try_emplace(Key, Vec);
            } else {
              Vec = VecIt->second;
            }
          }

          std::optional<unsigned> InsertIdx = getInsertIndex(VU);
          if (InsertIdx) {
            auto *It =
                find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
                  // Checks if 2 insertelements are from the same buildvector.
                  InsertElementInst *VecInsert = Data.InsertElements.front();
                  return areTwoInsertFromSameBuildVector(
                      VU, VecInsert,
                      [](InsertElementInst *II) { return II->getOperand(0); });
                });
            unsigned Idx = *InsertIdx;
            if (It == ShuffledInserts.end()) {
              (void)ShuffledInserts.emplace_back();
              It = std::next(ShuffledInserts.begin(),
                             ShuffledInserts.size() - 1);
              SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
              if (Mask.empty())
                Mask.assign(FTy->getNumElements(), PoisonMaskElem);
              // Verify that this is not a vectorized buildvector; walk up the
              // insertelement chain and build the mask for the already
              // vectorized inserts.
              Value *Base = VU;
              while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
                if (IEBase != User &&
                    (!IEBase->hasOneUse() ||
                     getInsertIndex(IEBase).value_or(Idx) == Idx))
                  break;
                if (const TreeEntry *E = getTreeEntry(IEBase)) {
                  do {
                    IEBase = cast<InsertElementInst>(Base);
                    int IEIdx = *getInsertIndex(IEBase);
                    assert(Mask[IEIdx] == PoisonMaskElem &&
                           "InsertElementInstruction used already.");
                    Mask[IEIdx] = IEIdx;
                    Base = IEBase->getOperand(0);
                  } while (E == getTreeEntry(Base));
                  break;
                }
                Base = cast<InsertElementInst>(Base)->getOperand(0);
                // After vectorization the def-use chain has changed; look
                // through original insertelements if they were replaced by
                // vector instructions.
                auto It = VectorToInsertElement.find(Base);
                if (It != VectorToInsertElement.end())
                  Base = It->second;
              }
            }
            SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
            if (Mask.empty())
              Mask.assign(FTy->getNumElements(), PoisonMaskElem);
            Mask[Idx] = ExternalUse.Lane;
            It->InsertElements.push_back(cast<InsertElementInst>(User));
            continue;
          }
        }
      }
    }

    // Generate extracts for out-of-tree users and find the insertion point
    // for the extractelement lane.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (PHINode *PH = dyn_cast<PHINode>(User)) {
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
            }
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        Builder.SetInsertPoint(cast<Instruction>(User));
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        User->replaceUsesOfWith(Scalar, NewInst);
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock(),
                             F->getEntryBlock().getFirstInsertionPt());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      User->replaceUsesOfWith(Scalar, NewInst);
    }

    LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
  }
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize(std::nullopt);
  };

  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
    if (VF != VecVF) {
      if (any_of(Mask,
                 [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
        Vec = CreateShuffle(Vec, nullptr, Mask);
        return std::make_pair(Vec, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        Vec = CreateShuffle(Vec, nullptr, ResizeMask);
      }
    }
    return std::make_pair(Vec, false);
  };
  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // Find the first and the last instruction in the list of insertelements.
    sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    Value *NewInst = performExtractsShuffleAction<Value>(
        MutableArrayRef(Vector.data(), Vector.size()),
        FirstInsert->getOperand(0),
        [](Value *Vec) {
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
        },
        ResizeToVF,
        [&](ArrayRef<int> Mask, ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create a shuffle if the mask is a simple identity,
            // non-resizing mask.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : FirstInsert->getOperand(0),
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild the buildvector chain.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    SmallVector<Instruction *> Inserts;
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      if (*It == II)
        ++It;
      else
        Inserts.push_back(cast<Instruction>(II));
      II = dyn_cast<InsertElementInst>(II->getOperand(0));
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
      NewInst = II;
    }
    LastInsert->replaceAllUsesWith(NewInst);
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            PoisonValue::get(IE->getOperand(0)->getType()));
      IE->replaceUsesOfWith(IE->getOperand(1),
                            PoisonValue::get(IE->getOperand(1)->getType()));
      eraseInstruction(IE);
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
  SmallVector<Instruction *> RemovedInsts;
  // For each vectorized value:
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->State == TreeEntry::NeedToGather)
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
        continue;
#ifndef NDEBUG
      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
        for (User *U : Scalar->users()) {
          LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
          // It is legal to delete users in the ignorelist.
          assert((getTreeEntry(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
        }
      }
#endif
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      RemovedInsts.push_back(cast<Instruction>(Scalar));
    }
  }

  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);

  Builder.ClearInsertionPoint();
  InstrElementSize.clear();

  const TreeEntry &RootTE = *VectorizableTree.front().get();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                        cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
  }
  return Vec;
}

void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM the insertelement/shuffle sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or any element that we insert into it is an instruction
    // defined inside the loop, we can't hoist this instruction.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;

    // We can hoist this instruction; move it to the preheader.
    I->moveBefore(PreHeader->getTerminator());
    CSEBlocks.insert(PreHeader);
  }
13753 assert((
A ==
B) == (
A->getDFSNumIn() ==
B->getDFSNumIn()) &&
13754 "Different nodes should have different DFS numbers");
13755 return A->getDFSNumIn() <
B->getDFSNumIn();
13765 if (I1->getType() != I2->getType())
13767 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
13768 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
13770 return I1->isIdenticalTo(I2);
13771 if (SI1->isIdenticalTo(SI2))
13773 for (
int I = 0, E = SI1->getNumOperands();
I < E; ++
I)
13774 if (SI1->getOperand(
I) != SI2->getOperand(
I))
13777 NewMask.
assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
13781 unsigned LastUndefsCnt = 0;
13782 for (
int I = 0, E = NewMask.
size();
I < E; ++
I) {
13788 NewMask[
I] != SM1[
I])
13791 NewMask[
I] = SM1[
I];
13795 return SM1.
size() - LastUndefsCnt > 1 &&
13799 SM1.
size() - LastUndefsCnt));
13805 for (
auto I = CSEWorkList.
begin(), E = CSEWorkList.
end();
I != E; ++
I) {
13808 "Worklist not sorted properly!");
13814 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
13815 !GatherShuffleExtractSeq.contains(&In))
13820 bool Replaced =
false;
13823 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
13824 DT->
dominates(V->getParent(), In.getParent())) {
13825 In.replaceAllUsesWith(V);
13827 if (
auto *SI = dyn_cast<ShuffleVectorInst>(V))
13828 if (!NewMask.
empty())
13829 SI->setShuffleMask(NewMask);
13833 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
13834 GatherShuffleExtractSeq.contains(V) &&
13835 IsIdenticalOrLessDefined(V, &In, NewMask) &&
13836 DT->
dominates(In.getParent(), V->getParent())) {
13838 V->replaceAllUsesWith(&In);
13840 if (
auto *SI = dyn_cast<ShuffleVectorInst>(&In))
13841 if (!NewMask.
empty())
13842 SI->setShuffleMask(NewMask);
13850 Visited.push_back(&In);
13855 GatherShuffleExtractSeq.clear();
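
// Chains the ScheduleData nodes of the given scalars into a single bundle:
// the nodes are linked through FirstInBundle/NextInBundle so the scheduler
// can treat the whole group as one scheduling entity.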
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
  ScheduleData *Bundle = nullptr;
  ScheduleData *PrevInBundle = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member "
           "(maybe not in same basic block)");
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle) {
      PrevInBundle->NextInBundle = BundleMember;
    } else {
      Bundle = BundleMember;
    }

    // Group the instructions to a bundle.
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  assert(Bundle && "Failed to find schedule bundle");
  return Bundle;
}
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  // No need to schedule PHIs, insertelement, extractelement and extractvalue
  // instructions.
  if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
      doesNotNeedToSchedule(VL))
    return nullptr;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.OpValue << "\n");

  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
                                                       ScheduleData *Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    if (ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
        doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
      ReSchedule = true;
    }
    if (Bundle) {
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
    }

    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" there are no cyclic
    // dependencies and we can schedule it. Note that it is important that we
    // don't "schedule" the bundle yet (see cancelScheduling).
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleData *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
             "must be ready to schedule");
      schedule(Picked, ReadyInsts);
    }
  };

  // Make sure that the scheduling region contains all instructions of the
  // bundle.
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the region cannot be extended, still recalculate dependencies:
      // otherwise the compiler may crash trying to incorrectly calculate
      // dependencies and emit instructions in the wrong order.
      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");

    // Make sure we don't leave the pieces of the bundle in the ready list
    // when the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);

    if (!BundleMember->IsScheduled)
      continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle. Just get rid of the
    // existing schedule.
    LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  auto *Bundle = buildBundle(VL);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle->isReady()) {
    cancelScheduling(VL, S.OpValue);
    return std::nullopt;
  }
  return Bundle;
}
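
// Undoes bundling performed by tryScheduleBundle: the members become
// independent single-instruction scheduling entities again and are re-added
// to the ready list if they have no unscheduled dependencies.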
  ScheduleData *Bundle = getScheduleData(OpValue);
  LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() &&
         "tried to unbundle something which is not a bundle");

  // Remove the bundle from the ready list.
  if (Bundle->isReady())
    ReadyInsts.remove(Bundle);

  // Un-bundle: make single instructions out of the bundle.
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->TE = nullptr;
    if (BundleMember->unscheduledDepsInBundle() == 0)
      ReadyInsts.insert(BundleMember);
    BundleMember = Next;
  }
}
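
// ScheduleData nodes are allocated in fixed-size chunks to amortize
// allocation cost; this returns the next free slot, growing the chunk list
// on demand.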
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
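
// Grows the scheduling region so that it contains V, searching up and down
// from the current region boundaries at the same time. Fails (returns false)
// if ScheduleRegionSizeLimit would be exceeded.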
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  if (getScheduleData(V, isOneOf(S, V)))
    return true;
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
         !doesNotNeedToBeScheduled(I) &&
         "phi nodes/insertelements/extractelements/extractvalues don't need to "
         "be scheduled");
  auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
    ScheduleData *ISD = getScheduleData(I);
    if (!ISD)
      return false;
    assert(isInSchedulingRegion(ISD) &&
           "ScheduleData not in scheduling region");
    ScheduleData *SD = allocateScheduleDataChunks();
    SD->Inst = I;
    SD->init(SchedulingRegionID, S.OpValue);
    ExtraScheduleDataMap[I][S.OpValue] = SD;
    return true;
  };
  if (CheckScheduleForI(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    if (isOneOf(S, I) != I)
      CheckScheduleForI(I);
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region. Ignore
  // assume-like intrinsics so they don't count against the budget.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
      return false;
    }

    ++UpIter;
    ++DownIter;

    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    if (isOneOf(S, I) != I)
      CheckScheduleForI(I);
    LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  if (isOneOf(S, I) != I)
    CheckScheduleForI(I);
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
    if (doesNotNeedToBeScheduled(I))
      continue;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
      SD->Inst = I;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
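
// Computes def-use, control and memory dependencies for all members of the
// bundle SD (and, transitively, for any bundle discovered without valid
// dependencies). Optionally inserts ready bundles into the ready list.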
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  SmallVector<ScheduleData *, 10> WorkList;
  WorkList.push_back(SD);

  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.pop_back_val();
    for (ScheduleData *BundleMember = SD; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      assert(isInSchedulingRegion(BundleMember));
      if (BundleMember->hasValidDependencies())
        continue;

      BundleMember->Dependencies = 0;
      BundleMember->resetUnscheduledDeps();

      // Handle def-use chain dependencies.
      if (BundleMember->OpValue != BundleMember->Inst) {
        if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
      } else {
        for (User *U : BundleMember->Inst->users()) {
          if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
            BundleMember->Dependencies++;
            ScheduleData *DestBundle = UseSD->FirstInBundle;
            if (!DestBundle->IsScheduled)
              BundleMember->incrementUnscheduledDeps(1);
            if (!DestBundle->hasValidDependencies())
              WorkList.push_back(DestBundle);
          }
        }
      }

      auto MakeControlDependent = [&](Instruction *I) {
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->ControlDependencies.push_back(BundleMember);
        BundleMember->Dependencies++;
        ScheduleData *DestBundle = DepDest->FirstInBundle;
        if (!DestBundle->IsScheduled)
          BundleMember->incrementUnscheduledDeps(1);
        if (!DestBundle->hasValidDependencies())
          WorkList.push_back(DestBundle);
      };

      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
        for (Instruction *I = BundleMember->Inst->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
            continue;

          // Add the dependency.
          MakeControlDependent(I);

          if (!isGuaranteedToTransferExecutionToSuccessor(I))
            // Everything past here must be control dependent on I.
            break;
        }
      }

      if (RegionHasStackSave) {
        // If we have an inalloca alloca instruction, it needs to be scheduled
        // after any preceding stacksave. We also need to prevent any alloca
        // from reordering above a preceding stackrestore.
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst,
                  m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              // Any allocas past here must be control dependent on I, and I
              // must be memory dependent on BundleMember->Inst.
              break;

            if (!isa<AllocaInst>(I))
              continue;

            // Add the dependency.
            MakeControlDependent(I);
          }
        }

        // In addition, prevent allocas and loads/stores from moving below a
        // stacksave or a stackrestore.
        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              continue;

            // Add the dependency.
            MakeControlDependent(I);
            break;
          }
        }
      }

      // Handle the memory dependencies (if any).
      ScheduleData *DepDest = BundleMember->NextLoadStore;
      if (!DepDest)
        continue;
      Instruction *SrcInst = BundleMember->Inst;
      assert(SrcInst->mayReadOrWriteMemory() &&
             "NextLoadStore list for non memory effecting bundle?");
      MemoryLocation SrcLoc = getLocation(SrcInst);
      bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
      unsigned NumAliased = 0;
      unsigned DistToSrc = 1;

      for (; DepDest; DepDest = DepDest->NextLoadStore) {
        assert(isInSchedulingRegion(DepDest));

        // Two limits reduce the complexity: AliasedCheckLimit bounds the
        // number of (expensive) SLP->isAliased calls, MaxMemDepDistance aborts
        // the whole loop for very large blocks.
        if (DistToSrc >= MaxMemDepDistance ||
            ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
             (NumAliased >= AliasedCheckLimit ||
              SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
          // Count only aliased locations for a better balance between reduced
          // runtime and accurate dependencies.
          NumAliased++;

          DepDest->MemoryDependencies.push_back(BundleMember);
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = DepDest->FirstInBundle;
          if (!DestBundle->IsScheduled) {
            BundleMember->incrementUnscheduledDeps(1);
          }
          if (!DestBundle->hasValidDependencies()) {
            WorkList.push_back(DestBundle);
          }
        }
        // ...
        DistToSrc++;
      }
    }
    if (InsertInReadyList && SD->isReady()) {
      ReadyInsts.insert(SD);
      LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst
                        << "\n");
    }
  }
}
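
// Marks every ScheduleData in the region as unscheduled and clears the ready
// list, so the region can be rescheduled from scratch.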
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd;
       I = I->getNextNode()) {
    doForAllOpcodes(I, [&](ScheduleData *SD) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->IsScheduled = false;
      SD->resetUnscheduledDeps();
    });
  }
  ReadyInsts.clear();
}
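
// Performs the final scheduling of the block: all bundles are emitted in
// dependency order, keeping the result as close as possible to the original
// instruction order.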
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  // WARNING: If changing this order causes a correctness issue, that means
  // there is some missing dependence edge in the schedule data graph.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
      TreeEntry *SDTE = getTreeEntry(SD->Inst);
      (void)SDTE;
      assert((isVectorLikeInstWithConstOps(SD->Inst) ||
              SD->isPartOfBundle() ==
                  (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;

      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, false, this);
    });
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      Instruction *PickedInst = BundleMember->Inst;
      if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
        PickedInst->moveAfter(LastScheduledInst->getPrevNode());
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
      if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
        assert(SD->IsScheduled && "must be scheduled at this point");
      }
    });
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
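
// Returns the element width (in bits) that should be used when vectorizing
// an expression tree rooted at V, preferring the width of the memory
// operations feeding the tree over the width of V itself.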
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value without
  // traversing the expression tree. This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, traverse the expression tree bottom-up to find the
  // loads that feed it; the width of memory operations usually indicates a
  // more suitable element size than the width of V's own type.
  SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V)) {
    Worklist.emplace_back(I, I->getParent());
    Visited.insert(I);
  }

  unsigned Width = 0;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip it.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;

    // If the current instruction is a load or an extract, update Width to
    // reflect the width of its result.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
    else {
      // Otherwise visit the operands: add unvisited instructions from the
      // same basic block as the user (or any operand of a PHI node) to the
      // worklist.
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            Worklist.emplace_back(J, J->getParent());
            continue;
          }
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
    }
  }

  // If we didn't encounter a memory access in the expression tree, just
  // return the width of V (or the first non-bool value found for i1 roots).
  if (Width == 0) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  if (auto *I = dyn_cast<Instruction>(V))
    InstrElementSize[I] = Width;

  return Width;
}
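
// Walks the tree entry's operands recursively and collects the entries whose
// scalars can be demoted to a smaller integer bit width without changing the
// computed values.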
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
    bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth = DL->getTypeSizeInBits(E.Scalars.front()->getType());
  // ...
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    // ...
    unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*DL));
    if (IsSigned)
      ++BitWidth1;
    if (auto *I = dyn_cast<Instruction>(V)) {
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSigned && BitWidth2 < OrigBitWidth) {
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
        if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
          break;
        BitWidth2 *= 2;
      }
      BitWidth1 = std::min(BitWidth1, BitWidth2);
    }
    BitWidth = std::max(BitWidth, BitWidth1);
    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
  };
  using namespace std::placeholders;
  auto FinalAnalysis = [&]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.State == TreeEntry::NeedToGather &&
        all_of(E.Scalars, IsaPred<Constant>))
      ToDemote.push_back(E.Idx);
    return Res;
  };
  if (E.State == TreeEntry::NeedToGather || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !getTreeEntry(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !all_of(V->users(), [=](User *U) {
                 return getTreeEntry(U) ||
                        (UserIgnoreList && UserIgnoreList->contains(U)) ||
                        (!isa<CmpInst>(U) && U->getType()->isSized() &&
                         !U->getType()->isScalableTy() &&
                         DL->getTypeSizeInBits(U->getType()) <= BitWidth);
               }) &&
               !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, Level, IsProfitableToDemote,
                                 IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          BitWidth = OrigBitWidth;
          return false;
        }
        MaxDepthLevel = 1;
        BitWidth = BestFailBitwidth;
        NeedToExit = true;
        return true;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth,
          ArrayRef<const TreeEntry *> Operands = std::nullopt,
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                              std::ref(BitWidth)));
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (E.UserTreeIndices.size() > 1 &&
              !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                           std::ref(BitWidth))))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };
  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // inrange amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // If this is a truncate of a logical shr, we can truncate it to a smaller
    // lshr iff we know that the bits we would otherwise be shifting in are
    // already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // If this is a truncate of an arithmetic shr, we can truncate it to a
    // smaller ashr iff we know that all the bits from the sign bit of the
    // original type and the sign bit of the truncate type are similar.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                                nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        return SignBits <= ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                              nullptr, DT) &&
               SignBits <= ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
                                              nullptr, DT);
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    }
    // Choose the best bitwidth based on cost estimations (elided in this
    // excerpt).
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // ...
    // if (Cost < BestCost) { BestCost = Cost; BestBitWidth = BitWidth; }
    // ...
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(CallChecker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
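
// Based on the demotable values collected above, computes the minimal legal
// bit width for each vectorizable subtree and records it in MinBWs.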
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->getOpcode() == Instruction::Store ||
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt &&
      VectorizableTree.front()->State != TreeEntry::NeedToGather)
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
      (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                              [NodeIdx](const EdgeInfo &EI) {
                                return EI.UserTE->Idx >
                                       static_cast<int>(NodeIdx);
                              })))
    return;

  // The first value node for store/insertelement is sext/zext/trunc? Skip it,
  // it is only a part of the transformation.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
                                bool IsProfitableToDemoteRoot, unsigned Opcode,
                                unsigned Limit, bool IsTruncRoot,
                                bool IsSignedCmp) {
    ToDemote.clear();
    unsigned VF = E.getVectorFactor();
    auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
    if (!TreeRootIT || !Opcode)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    unsigned NumParts =
        TTI->getNumberOfParts(FixedVectorType::get(TreeRootIT, VF));

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type,
    // rather than sign-extended. We know that if the leading bits are not
    // demanded, we can safely zero-extend, so we initialize IsKnownPositive to
    // true and clear it if the sign bit of any root is not known to be zero.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    // We first check if all the bits of the roots are demanded. If they're
    // not, we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
      TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, we must add one to the
      // maximum bit width to account for the unknown sign bit. This preserves
      // the existing sign bit so we can safely sign-extend the root back to
      // the original type.
      if (!IsKnownPositive)
        ++BitWidth1;

      APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // If the demoted type would still need the same number of vector parts,
    // demotion is not profitable.
    if (NumParts > 1 &&
        NumParts ==
            TTI->getNumberOfParts(FixedVectorType::get(
                IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
      return 0u;

    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the values that can be demoted in ToDemote.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, MaxDepthLevel, NeedToDemote,
                               IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
                                               ->getOperand(0)
                                               ->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);
    return MaxBitWidth;
  };

  // Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    for (Value *V : *UserIgnoreList) {
      unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
      auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
        ++BitWidth1;
      unsigned BitWidth2 = BitWidth1;
      // ...
      APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
      BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      ReductionBitWidth =
          std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
    }
    if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
      ReductionBitWidth = 8;
    ReductionBitWidth = bit_ceil(ReductionBitWidth);
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx].get(), IsTopRoot, IsProfitableToDemoteRoot,
        Opcode, Limit, IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [](const EdgeInfo &EI) {
                   return EI.EdgeIdx == 0 &&
                          EI.UserTE->getOpcode() == Instruction::Trunc &&
                          !EI.UserTE->isAltShuffle();
                 });
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [&](const EdgeInfo &EI) {
                   return EI.UserTE->getOpcode() == Instruction::ICmp &&
                          any_of(EI.UserTE->Scalars, [&](Value *V) {
                            auto *IC = dyn_cast<ICmpInst>(V);
                            return IC &&
                                   (IC->isSigned() ||
                                    !isKnownNonNegative(IC->getOperand(0),
                                                        SimplifyQuery(*DL)) ||
                                    !isKnownNonNegative(IC->getOperand(1),
                                                        SimplifyQuery(*DL)));
                          });
                 });
    }

    // If the maximum bit width we compute is less than the width of the
    // roots' type, we can proceed with the narrowing. Otherwise, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = TE->getOpcode() == Instruction::SExt ||
                      any_of(TE->Scalars, [&](Value *R) {
                        return !isKnownNonNegative(R, SimplifyQuery(*DL));
                      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
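
// Entry points of the pass: the new pass manager's run() forwards to
// runImpl(), which walks the function block by block and tries the store,
// reduction and GEP vectorization seeds collected per block.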
PreservedAnalyses SLPVectorizerPass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  // ... (fetch the analyses from AM)
  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
  if (!RunSLPVectorization)
    return false;
  // ... (store the analysis pointers)
  DL = &F.getParent()->getDataLayout();

  Stores.clear();
  GEPs.clear();
  bool Changed = false;

  // If the target claims to have no vector registers don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom up slp vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // Update DFS numbers now so that we can use them for ordering.
  DT->updateDFSNumbers();

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // ...
    // Start a new block - clear the list of reduction roots.
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
    LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
  }
  return Changed;
}
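
// Tries to vectorize a single chain of consecutive stores. Returns
// std::nullopt if the chain is not even schedulable, otherwise whether the
// chain was vectorized; Size is updated with a tree-size hint for the caller.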
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  Size = 0;
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();
  // ...
  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsPowerOf2 = isPowerOf2_32(ValOps.size());
    if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
         (!S.MainOp->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(),
                 [&](Value *V) {
                   return !isa<ExtractElementInst>(V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(V->users(), [&](User *U) {
                             return !Stores.contains(U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
      Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
      return false;
    }
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree is tiny and the store itself or its value is not
  // vectorizable.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getTreeSize();
    return false;
  }
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.buildExternalUses();

  R.computeMinimumValueSizes();
  R.transformNodes();

  Size = R.getTreeSize();
  if (S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
                    << "\n");
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}
/// Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  unsigned Num = 0;
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        if (Size == 1)
          return V;
        ++Num;
        return V + Size;
      });
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       if (P == 1)
                         return V;
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 81 / (Mean * Mean) == 0;
}
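
// Groups the collected stores by constant pointer distance and repeatedly
// tries decreasing vectorization factors on each group, remembering
// non-schedulable sizes so they are not retried.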
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  struct StoreDistCompare {
    bool operator()(const std::pair<unsigned, int> &Op1,
                    const std::pair<unsigned, int> &Op2) const {
      return Op1.second < Op2.second;
    }
  };
  // A set of pairs (index of store in Stores array ref, distance of the store
  // address relative to the base store address in units).
  using StoreIndexToDistSet =
      std::set<std::pair<unsigned, int>, StoreDistCompare>;
  auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
    int PrevDist = -1;
    BoUpSLP::ValueList Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(Set)) {
      if (Operands.empty() || Data.second - PrevDist == 1) {
        Operands.push_back(Stores[Data.first]);
        PrevDist = Data.second;
        if (Idx != Set.size() - 1)
          continue;
      }
      auto E = make_scope_exit([&, &DataVar = Data]() {
        Operands.clear();
        Operands.push_back(Stores[DataVar.first]);
        PrevDist = DataVar.second;
      });

      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      unsigned MaxRegVF = MaxVF;
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      if (ValueTy == StoreTy &&
          R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
        MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      unsigned MinVF = std::max<unsigned>(
          2, PowerOf2Ceil(TTI->getStoreMinimumVF(
                 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
                 ValueTy)));

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // lanes are used.
        unsigned CandVF = Operands.size();
        if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF)
          NonPowerOf2VF = CandVF;
      }

      unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
      SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
      unsigned Size = MinVF;
      for_each(reverse(CandidateVFs), [&](unsigned &VF) {
        VF = Size > MaxVF ? NonPowerOf2VF : Size;
        Size *= 2;
      });
      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
        P.first = P.second = 1;
      });
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph;
        for (unsigned Size : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned StartIdx = std::distance(
              RangeSizes.begin(),
              find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
                                            std::placeholders::_1)));
          while (StartIdx < End) {
            unsigned EndIdx =
                std::distance(RangeSizes.begin(),
                              find_if(RangeSizes.drop_front(StartIdx),
                                      std::bind(IsVectorized, Size >= MaxRegVF,
                                                std::placeholders::_1)));
            unsigned Sz = EndIdx >= End ? End : EndIdx;
            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
              if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
                                  Size >= MaxRegVF)) {
                ++Cnt;
                continue;
              }
              ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
                  Cnt += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
              if (!Res) {
                // Remember the failing size so this slice is not re-analyzed.
                NonSchedulable
                    .try_emplace(Slice.front(), std::make_pair(Size, Size))
                    .first->getSecond()
                    .second = Size;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert(Slice.begin(), Slice.end());
                AnyProfitableGraph = RepeatChanged = Changed = true;
                // If we vectorized the initial block, no need to try to
                // vectorize it again.
                for_each(RangeSizes.slice(Cnt, Size),
                         [](std::pair<unsigned, unsigned> &P) {
                           P.first = P.second = 0;
                         });
                if (Cnt < StartIdx + MinVF) {
                  for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  StartIdx = Cnt + Size;
                }
                if (Cnt > Sz - Size - MinVF) {
                  for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                }
                Cnt += Size;
                continue;
              }
              if (Size > 2 && Res &&
                  !all_of(RangeSizes.slice(Cnt, Size),
                          std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
                                    std::placeholders::_1))) {
                Cnt += Size;
                continue;
              }
              // Check for very big VFs that we're not rebuilding the same
              // trees, just with a larger number of elements.
              if (Size > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(Cnt, Size),
                         std::bind(FirstSizeSame, TreeSize,
                                   std::placeholders::_1))) {
                Cnt += Size;
                while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
                  ++Cnt;
                continue;
              }
              if (TreeSize > 1)
                for_each(RangeSizes.slice(Cnt, Size),
                         [&](std::pair<unsigned, unsigned> &P) {
                           if (Size >= MaxRegVF)
                             P.second = std::max(P.second, TreeSize);
                           else
                             P.first = std::max(P.first, TreeSize);
                         });
              ++Cnt;
              AnyProfitableGraph = true;
            }
            if (StartIdx >= End)
              break;
            if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
              AnyProfitableGraph = true;
            StartIdx = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(Sz),
                        std::bind(IsNotVectorized, Size >= MaxRegVF,
                                  std::placeholders::_1)));
          }
          if (!AnyProfitableGraph && Size >= MaxRegVF)
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if we tried all attempts or there is no need for the last
        // attempts at all.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
            Operands.size(),
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes, std::bind(IsNotVectorized, true,
                                                  std::placeholders::_1))) +
                1)));
        unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        });
        // Last attempt to vectorize the max number of elements, if all
        // previous attempts were unsuccessful because of the cost issues.
        CandidateVFs.clear();
        CandidateVFs.push_back(VF);
      }
    }
  };

  // For each store, the set is keyed by the first store of the group; the set
  // elements are (store index, distance to the base address) pairs.
  SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
  // Inserts the store SI with index Idx into the matching group, flushing and
  // restarting the group when a store with the same distance is seen again.
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
      std::optional<int> Diff = getPointersDiff(
          Stores[Set.first]->getValueOperand()->getType(),
          Stores[Set.first]->getPointerOperand(),
          SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
          /*StrictCheck=*/true);
      if (!Diff)
        continue;
      auto It = Set.second.find(std::make_pair(Idx, *Diff));
      if (It == Set.second.end()) {
        Set.second.emplace(Idx, *Diff);
        return;
      }
      // Try to vectorize the first found set to avoid duplicate analysis.
      TryToVectorize(Set.second);
      StoreIndexToDistSet PrevSet;
      PrevSet.swap(Set.second);
      Set.first = Idx;
      Set.second.emplace(Idx, 0);
      // Insert stores that followed the previous match to try to vectorize
      // them with this store.
      unsigned StartIdx = It->first + 1;
      SmallBitVector UsedStores(Idx - StartIdx);
      // Distances to the previously found dup store (or this store, since
      // they store to the same addresses).
      SmallVector<int> Dists(Idx - StartIdx, 0);
      for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
        // Do not try to vectorize sequences we already tried.
        if (Pair.first <= It->first ||
            VectorizedStores.contains(Stores[Pair.first]))
          break;
        unsigned BI = Pair.first - StartIdx;
        UsedStores.set(BI);
        Dists[BI] = Pair.second - It->second;
      }
      for (unsigned I = StartIdx; I < Idx; ++I) {
        unsigned BI = I - StartIdx;
        if (UsedStores.test(BI))
          Set.second.emplace(I, Dists[BI]);
      }
      return;
    }
    auto &Res = SortedStores.emplace_back();
    Res.first = Idx;
    Res.second.emplace(Idx, 0);
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (auto &Set : SortedStores)
        TryToVectorize(Set.second);
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (auto &Set : SortedStores)
    TryToVectorize(Set.second);

  return Changed;
}
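
// Scans the block once and buckets simple stores and single-index GEPs by
// the underlying object of their pointer operand; these buckets seed the
// store-chain and GEP vectorization attempts above.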
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
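
// Tries to vectorize an explicit list of scalars, shrinking the
// vectorization factor from the maximal register-determined VF down to MinVF
// until a profitable tree is found.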
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type, we permit
  // an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S.getOpcode())
    return false;

  Instruction *I0 = cast<Instruction>(S.OpValue);
  // Make sure invalid types (including vector type) are rejected before
  // determining vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give user internal llvm type name, which may
      // not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream rso(TypeStr);
        Ty->print(rso);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << rso.str() + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();
  Type *ScalarTy = VL[0]->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
    ScalarTy = IE->getOperand(1)->getType();

  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
    // No actual vectorization should happen, if number of parts is the same
    // as the provided vectorization factor (i.e. the scalar type is used for
    // the vector code during codegeneration).
    auto *VecTy = FixedVectorType::get(ScalarTy, VF);
    if (TTI->getNumberOfParts(VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);

      if (!isPowerOf2_32(ActualVF))
        continue;

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
        break;

      ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
      // Check that a previous iteration of this loop did not delete the
      // Value.
      if (llvm::any_of(Ops, [&R](Value *V) {
            auto *I = dyn_cast<Instruction>(V);
            return I && R.isDeleted(I);
          }))
        continue;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      R.reorderTopToBottom();
      R.reorderBottomToTop(
          /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
          !R.doesRootHaveInTreeUses());
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      R.transformNodes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
    return false;

  // First collect all possible candidates.
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P)
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P)
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P)
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P)
      Candidates.emplace_back(A1, B);
  }

  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
      R);
}
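
// Model of a horizontal reduction, e.g. "x[0] + x[1] + x[2] + x[3]" matched
// as a tree of associative operations. The members below track the reduction
// operations and reduced values discovered while matching.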
class HorizontalReduction {
  using ReductionOpsType = SmallVector<Value *, 16>;
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  /// The operations performing the reduction (a single list, or cmp + select
  /// pairs for min/max reductions).
  ReductionOpsListType ReductionOps;
  // ... (reduced values, reduction root and reduction kind members elided)
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;

  /// Checks if the reduction operation is the cmp + select min/max idiom.
  static bool isCmpSelMinMax(Instruction *I) {
    return RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I)) &&
           isa<SelectInst>(I) &&
           RecurrenceDescriptor::isIntMinMaxRecurrenceKind(getRdxKind(I));
  }

  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I) {
    if (Kind == RecurKind::None)
      return false;

    // Integer ops that map to select instructions or intrinsics are fine.
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
        isBoolLogicOp(I))
      return true;

    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0. We do not have to
      // rule out -0.0 here because the intrinsic semantics do not specify a
      // fixed result for it.
      return I->getFastMathFlags().noNaNs();
    }

    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;

    return I->isAssociative();
  }

  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    // Poison-safe 'or' takes the form: select X, true, Y. To make that work
    // with the normal operand processing, we skip the true value operand.
    if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
      return I->getOperand(2);
    return I->getOperand(Index);
  }

  /// Creates reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
    switch (Kind) {
    case RecurKind::Or:
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    case RecurKind::And:
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul:
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    case RecurKind::FMax:
      return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
    case RecurKind::FMin:
      return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
    case RecurKind::FMaximum:
      return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
    case RecurKind::FMinimum:
      return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
    case RecurKind::SMax:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
    case RecurKind::SMin:
      // ... (same pattern with ICmpSLT / Intrinsic::smin)
    case RecurKind::UMax:
      // ... (same pattern with ICmpUGT / Intrinsic::umax)
    case RecurKind::UMin:
      // ... (same pattern with ICmpULT / Intrinsic::umin)
    default:
      llvm_unreachable("Unable to create reduction from opcode.");
    }
  }

  /// Creates reduction operation with the current opcode with the IR flags
  /// from \p ReductionOps, dropping nuw/nsw flags.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
        propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
                         /*IncludeWrapFlags=*/false);
        propagateIRFlags(Op, ReductionOps[1], nullptr,
                         /*IncludeWrapFlags=*/false);
        return Op;
      }
    }
    propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
    return Op;
  }
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;

    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;

    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;
    // This matches either cmp + select or intrinsics.
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Look for min/max patterns based on extractelement operands, e.g.
      // select ((cmp Inst1, Inst2), Inst1, Inst2).
      CmpInst::Predicate Pred;
      Instruction *L1;
      Instruction *L2;
      Value *LHS = Select->getTrueValue();
      Value *RHS = Select->getFalseValue();
      Value *Cond = Select->getCondition();
      if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
        if (!isa<ExtractElementInst>(RHS) ||
            !L2->isIdenticalTo(cast<Instruction>(RHS)))
          return RecurKind::None;
      } else if (match(Cond,
                       m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
        if (!isa<ExtractElementInst>(LHS) ||
            !L1->isIdenticalTo(cast<Instruction>(LHS)))
          return RecurKind::None;
      } else {
        if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
          return RecurKind::None;
        if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
            !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
            !L2->isIdenticalTo(cast<Instruction>(RHS)))
          return RecurKind::None;
      }

      switch (Pred) {
      default:
        return RecurKind::None;
      case CmpInst::ICMP_SGT:
      case CmpInst::ICMP_SGE:
        return RecurKind::SMax;
      case CmpInst::ICMP_SLT:
      case CmpInst::ICMP_SLE:
        return RecurKind::SMin;
      case CmpInst::ICMP_UGT:
      case CmpInst::ICMP_UGE:
        return RecurKind::UMax;
      case CmpInst::ICMP_ULT:
      case CmpInst::ICMP_ULE:
        return RecurKind::UMin;
      }
    }
    return RecurKind::None;
  }
  /// Index of the first operand of \p I that participates in the reduction
  /// (skips the condition of a cmp + select min/max).
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB. For a cmp + select
  /// min/max reduction check that both the compare and the select are in BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // The select must be used twice while the condition must have a single
      // use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }

    // An arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Adds all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    auto *I = dyn_cast<Instruction>(Data.front());
    return Sz > 1 || isConstant(Data.front()) ||
           (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
  }
  /// Try to find a reduction tree rooted at \p Root.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;

    // ...
    // Though the ultimate reduction may have multiple uses, its condition
    // must have only a single use.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<Instruction *> Worklist(1, Root);
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values or an extra argument
    // that is not part of the reduction.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &ExtraArgs,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps) {
      for (int I = getFirstOperandIndex(TreeN),
               End = getNumberOfOperands(TreeN);
           I < End; ++I) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // An edge with the wrong parent is an extra argument.
        if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
            !hasSameParent(EdgeInst, BB)) {
          ExtraArgs.push_back(EdgeVal);
          continue;
        }
        // If the edge is not an instruction, or it differs from the main
        // reduction opcode or has too many uses - possible reduced value.
        if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to reduce
    // them. Values are grouped by their value ids, loads - by their pointer
    // operands.
    MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
        PossibleReducedVals;
    initReductionOps(Root);
    DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;
    SmallPtrSet<Value *, 4> DoNotReverseVals;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
      if (LoadKeyUsed.contains(Key)) {
        auto LIt = LoadsMap.find(Ptr);
        if (LIt != LoadsMap.end()) {
          for (LoadInst *RLI : LIt->second) {
            if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                                LI->getType(), LI->getPointerOperand(), DL, SE,
                                /*StrictCheck=*/true))
              return hash_value(RLI->getPointerOperand());
          }
          for (LoadInst *RLI : LIt->second) {
            if (arePointersCompatible(RLI->getPointerOperand(),
                                      LI->getPointerOperand(), TLI)) {
              hash_code SubKey = hash_value(RLI->getPointerOperand());
              DoNotReverseVals.insert(RLI);
              return SubKey;
            }
          }
          if (LIt->second.size() > 2) {
            hash_code SubKey =
                hash_value(LIt->second.back()->getPointerOperand());
            DoNotReverseVals.insert(LIt->second.back());
            return SubKey;
          }
        }
      }
      LoadKeyUsed.insert(Key);
      LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    while (!Worklist.empty()) {
      Instruction *TreeN = Worklist.pop_back_val();
      SmallVector<Value *> Args;
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
      // If too many extra args - mark the instruction itself as a reduction
      // value, not a reduction operation.
      if (Args.size() < 2) {
        addReductionOps(TreeN);
        // Add extra args.
        if (!Args.empty()) {
          assert(Args.size() == 1 && "Expected only single argument.");
          ExtraArgs[TreeN] = Args.front();
        }
        // Add reduction values. The values are sorted for better
        // vectorization results.
        for (Value *V : PossibleRedVals) {
          size_t Key = 0, Idx = 0;
          std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                                                 /*AllowAlternate=*/false);
          ++PossibleReducedVals[Key][Idx]
                .insert(std::make_pair(V, 0))
                .first->second;
        }
        Worklist.append(PossibleReductionOps.rbegin(),
                        PossibleReductionOps.rend());
      } else {
        size_t Key = 0, Idx = 0;
        std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI,
                                               GenerateLoadsSubkey,
                                               /*AllowAlternate=*/false);
        ++PossibleReducedVals[Key][Idx]
              .insert(std::make_pair(TreeN, 0))
              .first->second;
      }
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds to start the reduction
    // from the longest possible reduced-values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
           It != E; ++It) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = It->second.takeVector();
        stable_sort(RedValsVect, llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      int NewIdx = -1;
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (isGoodForReduction(Data) ||
            (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
             isa<LoadInst>(ReducedVals[NewIdx].front()) &&
             getUnderlyingObject(
                 cast<LoadInst>(Data.front())->getPointerOperand()) ==
                 getUnderlyingObject(
                     cast<LoadInst>(ReducedVals[NewIdx].front())
                         ->getPointerOperand()))) {
          if (NewIdx < 0) {
            NewIdx = ReducedVals.size();
            ReducedVals.emplace_back();
          }
          if (DoNotReverseVals.contains(Data.front()))
            ReducedVals[NewIdx].append(Data.begin(), Data.end());
          else
            ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
        } else {
          ReducedVals.emplace_back().append(Data.rbegin(), Data.rend());
        }
      }
    }
    // ...
    return true;
  }
16494 constexpr int ReductionLimit = 4;
16495 constexpr unsigned RegMaxNumber = 4;
16496 constexpr unsigned RedValsMaxNumber = 128;
16500 unsigned NumReducedVals =
16501 std::accumulate(ReducedVals.
begin(), ReducedVals.
end(), 0,
16503 if (!isGoodForReduction(Vals))
16505 return Num + Vals.size();
16507 if (NumReducedVals < ReductionLimit &&
16512 for (ReductionOpsType &RdxOps : ReductionOps)
16513 for (
Value *RdxOp : RdxOps)
16514 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
16525 ReducedVals.
size() * ReducedVals.
front().size() + ExtraArgs.size());
16528 ExternallyUsedValues.
reserve(ExtraArgs.size() + 1);
16531 for (
const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
16532 assert(Pair.first &&
"DebugLoc must be set.");
16533 ExternallyUsedValues[Pair.second].push_back(Pair.first);
16534 TrackedVals.
try_emplace(Pair.second, Pair.second);
16539 auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
16540 assert(isa<SelectInst>(RdxRootInst) &&
16541 "Expected min/max reduction to have select root instruction");
16542 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
16543 assert(isa<Instruction>(ScalarCond) &&
16544 "Expected min/max reduction to have compare condition");
16545 return cast<Instruction>(ScalarCond);
16549 auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
16550 if (VectorizedTree) {
16553 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
16554 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
16557 auto It = ReducedValsToOps.
find(Res);
16558 if (It != ReducedValsToOps.
end() &&
16564 return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
16570 bool AnyBoolLogicOp =
16572 return isBoolLogicOp(cast<Instruction>(V));
16576 ExternallyUsedValues[ReductionRoot];
16578 ReductionOps.front().size());
16579 for (ReductionOpsType &RdxOps : ReductionOps)
16580 for (
Value *RdxOp : RdxOps) {
16583 IgnoreList.insert(RdxOp);
16588 for (
Value *U : IgnoreList)
16589 if (
auto *FPMO = dyn_cast<FPMathOperator>(U))
16590 RdxFMF &= FPMO->getFastMathFlags();
16591 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
16596 for (
Value *V : Candidates)
16597 TrackedVals.try_emplace(V, V);
16603 Value *VectorizedTree =
nullptr;
16604 bool CheckForReusedReductionOps =
false;
16606 for (
unsigned I = 0,
E = ReducedVals.
size();
I <
E; ++
I) {
16612 for (
unsigned Cnt = 0, Sz = OrigReducedVals.
size(); Cnt < Sz; ++Cnt) {
16613 Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
16618 auto *Inst = dyn_cast<Instruction>(RdxVal);
16620 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
16621 (S.getOpcode() && !Inst))
16624 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
16626 bool ShuffledExtracts =
false;
16628 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
16630 InstructionsState NextS =
getSameOpcode(ReducedVals[
I + 1], TLI);
16631 if (NextS.getOpcode() == Instruction::ExtractElement &&
16632 !NextS.isAltShuffle()) {
16634 for (
Value *RV : ReducedVals[
I + 1]) {
16635 Value *RdxVal = TrackedVals.find(RV)->second;
16639 if (
auto *Inst = dyn_cast<Instruction>(RdxVal))
16640 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
16642 CommonCandidates.push_back(RdxVal);
16643 TrackedToOrig.try_emplace(RdxVal, RV);
16648 Candidates.
swap(CommonCandidates);
16649 ShuffledExtracts =
true;
16658 ++VectorizedVals.try_emplace(Candidates.
front(), 0).first->getSecond();
16660 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
16661 ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
16662 if (
auto *ResI = dyn_cast<Instruction>(Res))
16663 V.analyzedReductionRoot(ResI);
16665 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
16669 unsigned NumReducedVals = Candidates.
size();
16670 if (NumReducedVals < ReductionLimit &&
16677 IsSupportedHorRdxIdentityOp =
16679 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
16682 if (IsSupportedHorRdxIdentityOp)
16683 for (
Value *V : Candidates)
16684 ++SameValuesCounter.
insert(std::make_pair(V, 0)).first->second;
16695 bool SameScaleFactor =
false;
16696 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
16697 SameValuesCounter.
size() != Candidates.size();
16698 if (OptReusedScalars) {
16700 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
16701 RdxKind == RecurKind::Xor) &&
16703 [&SameValuesCounter](
const std::pair<Value *, unsigned> &
P) {
16704 return P.second == SameValuesCounter.
front().second;
16706 Candidates.resize(SameValuesCounter.
size());
16707 transform(SameValuesCounter, Candidates.begin(),
16708 [](
const auto &
P) { return P.first; });
16709 NumReducedVals = Candidates.size();
16711 if (NumReducedVals == 1) {
16712 Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
16713 unsigned Cnt = SameValuesCounter.
lookup(OrigV);
16715 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
16716 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
16717 VectorizedVals.try_emplace(OrigV, Cnt);
16722 unsigned MaxVecRegSize =
V.getMaxVecRegSize();
16723 unsigned EltSize =
V.getVectorElementSize(Candidates[0]);
16727 unsigned ReduxWidth = std::min<unsigned>(
16729 std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
16730 RegMaxNumber * RedValsMaxNumber));
16731 unsigned Start = 0;
16732 unsigned Pos = Start;
16734 unsigned PrevReduxWidth = ReduxWidth;
16735 bool CheckForReusedReductionOpsLocal =
false;
16736 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
16737 &CheckForReusedReductionOpsLocal,
16738 &PrevReduxWidth, &
V,
16739 &IgnoreList](
bool IgnoreVL =
false) {
16740 bool IsAnyRedOpGathered = !IgnoreVL &&
V.isAnyGathered(IgnoreList);
16741 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
16744 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
16747 if (Pos < NumReducedVals - ReduxWidth + 1)
16748 return IsAnyRedOpGathered;
16751 return IsAnyRedOpGathered;
16753 bool AnyVectorized =
false;
16754 while (Pos < NumReducedVals - ReduxWidth + 1 &&
16755 ReduxWidth >= ReductionLimit) {
16758 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
16760 CheckForReusedReductionOps =
true;
16763 PrevReduxWidth = ReduxWidth;
16766 if (
V.areAnalyzedReductionVals(VL)) {
16767 (void)AdjustReducedVals(
true);
16773 auto *RedValI = dyn_cast<Instruction>(RedVal);
16776 return V.isDeleted(RedValI);
16779 V.buildTree(VL, IgnoreList);
16780 if (
V.isTreeTinyAndNotFullyVectorizable(
true)) {
16781 if (!AdjustReducedVals())
16782 V.analyzedReductionVals(VL);
16785 if (
V.isLoadCombineReductionCandidate(RdxKind)) {
16786 if (!AdjustReducedVals())
16787 V.analyzedReductionVals(VL);
16790 V.reorderTopToBottom();
16792 V.reorderBottomToTop(
true);
16796 ExternallyUsedValues);
16797 for (
unsigned Cnt = 0, Sz = ReducedVals.
size(); Cnt < Sz; ++Cnt) {
16798 if (Cnt ==
I || (ShuffledExtracts && Cnt ==
I - 1))
16800 for (
Value *V : ReducedVals[Cnt])
16801 if (isa<Instruction>(V))
16802 LocalExternallyUsedValues[TrackedVals[
V]];
16804 if (!IsSupportedHorRdxIdentityOp) {
16807 "Reused values counter map is not empty");
16808 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
16809 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
16811 Value *
V = Candidates[Cnt];
16812 Value *OrigV = TrackedToOrig.find(V)->second;
16813 ++SameValuesCounter[OrigV];
16819 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
16820 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
16822 Value *RdxVal = Candidates[Cnt];
16823 if (!Visited.
insert(RdxVal).second)
16827 if (!VLScalars.contains(RdxVal) &&
V.isVectorized(RdxVal)) {
16828 LocalExternallyUsedValues[RdxVal];
16831 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
16833 VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
16834 if (NumOps != ReducedValsToOps.
find(OrigV)->second.size())
16835 LocalExternallyUsedValues[RdxVal];
16838 if (!IsSupportedHorRdxIdentityOp)
16839 SameValuesCounter.
clear();
16840 for (
Value *RdxVal : VL)
16841 if (RequiredExtract.
contains(RdxVal))
16842 LocalExternallyUsedValues[RdxVal];
16846 for (
const std::pair<Value *, Value *> &Pair : ReplacedExternals)
16847 ReplacementToExternal.
try_emplace(Pair.second, Pair.first);
16848 for (
const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
16850 auto RIt = ReplacementToExternal.
find(Ext);
16851 while (RIt != ReplacementToExternal.
end()) {
16853 RIt = ReplacementToExternal.
find(Ext);
16855 auto *It = ExternallyUsedValues.
find(Ext);
16856 if (It == ExternallyUsedValues.
end())
16858 LocalExternallyUsedValues[Pair.second].append(It->second);
16860 V.buildExternalUses(LocalExternallyUsedValues);
16862 V.computeMinimumValueSizes();
16863 V.transformNodes();
16868 getReductionCost(
TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
16871 <<
" for reduction\n");
16875 V.getORE()->emit([&]() {
16877 SV_NAME,
"HorSLPNotBeneficial",
16878 ReducedValsToOps.
find(VL[0])->second.front())
16879 <<
"Vectorizing horizontal reduction is possible "
16880 <<
"but not beneficial with cost " <<
ore::NV(
"Cost",
Cost)
16881 <<
" and threshold "
16884 if (!AdjustReducedVals())
16885 V.analyzedReductionVals(VL);
16889 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorizing horizontal reduction at cost:"
16890 <<
Cost <<
". (HorRdx)\n");
16891 V.getORE()->emit([&]() {
16893 SV_NAME,
"VectorizedHorizontalReduction",
16894 ReducedValsToOps.
find(VL[0])->second.front())
16895 <<
"Vectorized horizontal reduction with cost "
16896 <<
ore::NV(
"Cost",
Cost) <<
" and with tree size "
16897 <<
ore::NV(
"TreeSize",
V.getTreeSize());
        // Emit a reduction. If the root is a select (min/max idiom), the
        // insert point is the compare condition of that select.
        Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
        Instruction *InsertPt = RdxRootInst;
        if (IsCmpSelMinMax)
          InsertPt = GetCmpForMinMaxReduction(RdxRootInst);

        // Vectorize a tree.
        Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
                                                ReplacedExternals, InsertPt);

        Builder.SetInsertPoint(InsertPt);

        // To prevent poison from leaking across what used to be sequential,
        // safe, scalar boolean logic operations, the reduction operand must
        // be frozen.
        if ((isBoolLogicOp(RdxRootInst) ||
             (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
            !isGuaranteedNotToBePoison(VectorizedRoot))
          VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

        // Emit code to correctly handle reused reduced values, if required.
        if (OptReusedScalars && !SameScaleFactor) {
          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder,
                                         V.getRootNodeScalars(),
                                         SameValuesCounter, TrackedToOrig);
        }

        Value *ReducedSubTree =
            emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
        if (ReducedSubTree->getType() != VL.front()->getType()) {
          // Extend according to the known sign bits of the reduced values.
          ReducedSubTree = Builder.CreateIntCast(
              ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
                KnownBits Known = computeKnownBits(
                    R, cast<Instruction>(ReductionOps.front().front())
                           ->getDataLayout());
                return !Known.isNonNegative();
              }));
        }
        // Improved analysis for add/fadd/xor reductions with same scale
        // factor for all operands of reductions: emit a full vector reduction
        // and then scale the result by the common factor.
        if (OptReusedScalars && SameScaleFactor)
          ReducedSubTree = emitScaleForReusedOps(
              ReducedSubTree, Builder, SameValuesCounter.front().second);

        VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
        // Count vectorized reduced values to exclude them from the final
        // reduction.
        for (Value *RdxVal : VL) {
          Value *OrigV = TrackedToOrig.find(RdxVal)->second;
          if (IsSupportedHorRdxIdentityOp) {
            VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
            continue;
          }
          ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
          if (!V.isVectorized(RdxVal))
            RequiredExtract.insert(RdxVal);
        }
        AnyVectorized = true;
      }
      if (OptReusedScalars && !AnyVectorized) {
        // Nothing was actually vectorized, but every scalar matched an
        // identity pattern: scale the unique scalars directly.
        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
          Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          Value *OrigV = TrackedToOrig.find(P.first)->second;
          VectorizedVals.try_emplace(OrigV, P.second);
        }
      }
    }
    if (VectorizedTree) {
      // Reorder the operands of bool logical ops into their original order to
      // avoid extending poison propagation; if reordering is not possible
      // (both operands were originally RHS), freeze the LHS operand.
      auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
                                                   Instruction *RedOp1,
                                                   Instruction *RedOp2,
                                                   bool InitStep) {
        if (!AnyBoolLogicOp)
          return;
        if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                      getRdxOperand(RedOp1, 0) == LHS ||
                                      isGuaranteedNotToBePoison(LHS)))
          return;
        if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                      getRdxOperand(RedOp2, 0) == RHS ||
                                      isGuaranteedNotToBePoison(RHS))) {
          std::swap(LHS, RHS);
          return;
        }
        if (LHS != VectorizedTree)
          LHS = Builder.CreateFreeze(LHS);
      };
      // Finish the reduction: fold in the extra arguments and the
      // not-vectorized reduction values, pairing them up to avoid
      // dependencies between the scalar remainders after reductions.
      auto FinalGen =
          [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
              bool InitStep) {
            unsigned Sz = InstVals.size();
            SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
                                                                     Sz % 2);
            for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
              Instruction *RedOp = InstVals[I + 1].first;
              Value *RdxVal1 = InstVals[I].second;
              Value *StableRdxVal1 = RdxVal1;
              auto It1 = TrackedVals.find(RdxVal1);
              if (It1 != TrackedVals.end())
                StableRdxVal1 = It1->second;
              Value *RdxVal2 = InstVals[I + 1].second;
              Value *StableRdxVal2 = RdxVal2;
              auto It2 = TrackedVals.find(RdxVal2);
              if (It2 != TrackedVals.end())
                StableRdxVal2 = It2->second;
              // To prevent poison from leaking across what used to be
              // sequential, safe, scalar boolean logic operations, the
              // reduction operand must be frozen.
              FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                                RedOp, InitStep);
              Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                         StableRdxVal2, "op.rdx", ReductionOps);
              ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
            }
            if (Sz % 2 == 1)
              ExtraReds[Sz / 2] = InstVals.back();
            return ExtraReds;
          };
      SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
      ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
                                   VectorizedTree);
      SmallPtrSet<Value *, 8> Visited;
      for (ArrayRef<Value *> Candidates : ReducedVals) {
        for (Value *RdxVal : Candidates) {
          if (!Visited.insert(RdxVal).second)
            continue;
          unsigned NumOps = VectorizedVals.lookup(RdxVal);
          for (Instruction *RedOp :
               ArrayRef(ReducedValsToOps.find(RdxVal)->second)
                   .drop_back(NumOps))
            ExtraReductions.emplace_back(RedOp, RdxVal);
        }
      }
      // Iterate through all not-vectorized reduction values/extra arguments.
      for (auto &Pair : ExternallyUsedValues) {
        // Add each externally used value to the final reduction.
        for (auto *I : Pair.second)
          ExtraReductions.emplace_back(I, Pair.first);
      }
      bool InitStep = true;
      while (ExtraReductions.size() > 1) {
        VectorizedTree = ExtraReductions.front().second;
        SmallVector<std::pair<Instruction *, Value *>> NewReds =
            FinalGen(ExtraReductions, InitStep);
        ExtraReductions.swap(NewReds);
        InitStep = false;
      }
      VectorizedTree = ExtraReductions.front().second;

      ReductionRoot->replaceAllUsesWith(VectorizedTree);

      // The original scalar reduction is expected to have no remaining uses
      // outside the reduction tree itself; mark its ops for deletion.
#ifndef NDEBUG
      SmallPtrSet<Value *, 4> IgnoreSet;
      for (ArrayRef<Value *> RdxOps : ReductionOps)
        IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
#endif
      for (ArrayRef<Value *> RdxOps : ReductionOps) {
        for (Value *Ignore : RdxOps) {
          if (!Ignore)
            continue;
#ifndef NDEBUG
          for (auto *U : Ignore->users()) {
            assert(IgnoreSet.count(U) &&
                   "All users must be either in the reduction ops list.");
          }
#endif
          if (!Ignore->use_empty()) {
            Value *Undef = UndefValue::get(Ignore->getType());
            Ignore->replaceAllUsesWith(Undef);
          }
          V.eraseInstruction(cast<Instruction>(Ignore));
        }
      }
    } else if (!CheckForReusedReductionOps) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
    }
    return VectorizedTree;
  }
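
  // Illustrative sketch (not from this file): tryToReduce() above walks the
  // candidate scalars in power-of-two chunks, vectorizes each chunk, and then
  // folds the per-chunk results plus the leftover scalars into one value via
  // the ExtraReductions pairing. A standalone scalar model of that control
  // flow for an integer add reduction (all names hypothetical):
  //
  //   #include <bit>
  //   #include <cstddef>
  //   #include <numeric>
  //   #include <vector>
  //
  //   long long reduceInChunks(const std::vector<int> &Vals) {
  //     size_t Pos = 0;
  //     size_t Width = std::bit_floor(Vals.size());
  //     long long Acc = 0; // plays the role of VectorizedTree
  //     while (Width >= 2 && Pos + Width <= Vals.size()) {
  //       // One "vectorized" chunk reduction.
  //       Acc = std::accumulate(Vals.begin() + Pos,
  //                             Vals.begin() + Pos + Width, Acc);
  //       Pos += Width;
  //       Width = std::bit_floor(Vals.size() - Pos);
  //     }
  //     // Leftover scalars are folded in afterwards, like the final
  //     // reduction of not-vectorized values above.
  //     for (; Pos < Vals.size(); ++Pos)
  //       Acc += Vals[Pos];
  //     return Acc;
  //   }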
  /// Calculate the cost of a reduction.
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, unsigned ReduxWidth,
                                   FastMathFlags FMF) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    Type *ScalarTy = ReducedVals.front()->getType();
    FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
    InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at compile time.
    bool AllConsts = allConstant(ReducedVals);
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      InstructionCost Cost = 0;
      // Scalar cost is repeated for N-1 elements.
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        if (Cnt == 1)
          break;
        --Cnt;
        if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
          Cost += GenCostFn();
          continue;
        }
        InstructionCost ScalarCost = 0;
        for (User *U : RdxVal->users()) {
          auto *RdxOp = cast<Instruction>(U);
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
            ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
            continue;
          }
          ScalarCost = InstructionCost::getInvalid();
          break;
        }
        if (ScalarCost.isValid())
          Cost += ScalarCost;
        else
          Cost += GenCostFn();
      }
      return Cost;
    };
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
      if (!AllConsts)
        VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF,
                                                     CostKind);
      ScalarCost = EvaluateScalarCost([&]() {
        return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
      if (!AllConsts)
        VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
      ScalarCost = EvaluateScalarCost([&]() {
        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
        return TTI->getIntrinsicInstrCost(ICA, CostKind);
      });
      break;
    }
    default:
      llvm_unreachable("Expected arithmetic or min/max reduction operation");
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << " for reduction of " << shortBundleName(ReducedVals)
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
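
  // Worked example (hypothetical numbers): for an 8-wide integer add
  // reduction where one scalar add costs 1 and the target reports 3 for the
  // vector reduce, this returns 3 - 7 = -4. The caller adds the tree cost and
  // only vectorizes when the total stays below -SLPCostThreshold (0 by
  // default), i.e. when the vector form is strictly cheaper than the N-1
  // scalar ops it replaces:
  //
  //   InstructionCost VectorCost = 3;                  // assumed TTI value
  //   InstructionCost ScalarCost = 7 * 1;              // (N - 1) scalar adds
  //   InstructionCost Delta = VectorCost - ScalarCost; // -4 => profitable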
  /// Emit a horizontal reduction of the vectorized value.
  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                       unsigned ReduxWidth, const TargetTransformInfo *TTI) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(isPowerOf2_32(ReduxWidth) &&
           "We only handle power-of-two reductions for now");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");

    ++NumVectorInstructions;
    return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
  }
  /// Emits optimized code for a unique scalar value reused \p Cnt times.
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    switch (RdxKind) {
    case RecurKind::Add: {
      // res = mul vv, n
      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::Xor: {
      // res = n % 2 ? 0 : vv
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      if (Cnt % 2 == 0)
        return Constant::getNullValue(VectorizedValue->getType());
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul v, n
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // res = vv
      return VectorizedValue;
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::IAnyOf:
    case RecurKind::FAnyOf:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
    }
    return nullptr;
  }
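
  // Minimal standalone model (not LLVM code) of the identities used above for
  // a scalar X that appears Cnt times in the reduction:
  //
  //   int scaleForReusedAdd(int X, unsigned Cnt) { return X * int(Cnt); }
  //   int scaleForReusedXor(int X, unsigned Cnt) { return Cnt % 2 ? X : 0; }
  //   int scaleForReusedMinMax(int X, unsigned /*Cnt*/) { return X; }
  //
  // x + x + ... + x == x * n, x ^ x == 0, and min(x, x) == x. Mul/FMul have
  // no cheap closed form here (x * x * ... * x is a power), which is why they
  // land in the unreachable branch above.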
  /// Emits the actual operation for the scalar identity values, found during
  /// horizontal reduction analysis.
  Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                       ArrayRef<Value *> VL,
                       const MapVector<Value *, unsigned> &SameValuesCounter,
                       const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (VTy->getElementType() != VL.front()->getType()) {
      VectorizedValue = Builder.CreateIntCast(
          VectorizedValue,
          FixedVectorType::get(VL.front()->getType(), VTy->getNumElements()),
          any_of(VL, [&](Value *R) {
            KnownBits Known = computeKnownBits(
                R, cast<Instruction>(ReductionOps.front().front())
                       ->getDataLayout());
            return !Known.isNonNegative();
          }));
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      // root = mul prev_root, <1, 1, n, 1>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
        Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
      }
      Value *Scale = ConstantVector::get(Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple and/or(s).
      LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // No need for multiple min/max(s) of the same value.
      LLVM_DEBUG(dbgs() << "SLP: SameVals (min/max) " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace values with an even number of applies by 0, since
      // x xor x == 0.
      // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 7>
      SmallVector<int> Mask(
          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
          -1);
      std::iota(Mask.begin(), Mask.end(), 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        Value *V = VL[I];
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
        if (Cnt % 2 == 0) {
          Mask[I] = VF;
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I : Mask) dbgs() << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            VectorizedValue,
            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
      }
      Value *Scale = ConstantVector::get(Vals);
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::IAnyOf:
    case RecurKind::FAnyOf:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for reused scalars.");
    }
    return nullptr;
  }
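
  // Illustrative IR (hypothetical values): for an add reduction over the
  // scalars a + a + a + b vectorized as <a, b>, SameValuesCounter gives
  // {a: 3, b: 1}, so the lanes are pre-scaled before the final reduce:
  //
  //   %scaled = mul <2 x i32> %vec, <i32 3, i32 1>
  //   %res    = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %scaled)
  //
  // For xor, lanes whose scalar occurs an even number of times are zeroed by
  // shuffling in lanes of a zero vector, since x ^ x == 0.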
};
} // end anonymous namespace

/// Gets the recurrence kind from the specified value.
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}

static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();

  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0))
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}
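
// Worked examples of the walk above (sibling elements must all match):
//   <4 x float>            -> 4            (insertelement case)
//   [4 x float]            -> 4            (array of singles)
//   [2 x {float, float}]   -> 2 * 2 = 4    (homogeneous struct)
//   [2 x <4 x float>]      -> 2 * 4 = 8    (fixed vector leaf multiplies in)
//   {float, i32}           -> std::nullopt (heterogeneous struct elements)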
static void findBuildAggregate_rec(Instruction *LastInsertInst,
                                   TargetTransformInfo *TTI,
                                   SmallVectorImpl<Value *> &BuildVectorOpds,
                                   SmallVectorImpl<Value *> &InsertElts,
                                   unsigned OperandOffset) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(1);
    std::optional<unsigned> OperandIndex =
        getInsertIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex)
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
                             BuildVectorOpds, InsertElts, *OperandIndex);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}

/// Recognize construction of vectors from a chain of insertelement or
/// insertvalue instructions and collect the scalar build-vector operands.
static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");

  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);

  findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
  if (BuildVectorOpds.size() >= 2)
    return true;

  return false;
}
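
// Illustrative IR (hypothetical values): the pattern recognized here is a
// single-use chain of inserts that fills a whole aggregate, e.g.
//
//   %r0 = insertelement <4 x float> poison, float %s0, i32 0
//   %r1 = insertelement <4 x float> %r0,    float %s1, i32 1
//   %r2 = insertelement <4 x float> %r1,    float %s2, i32 2
//   %r3 = insertelement <4 x float> %r2,    float %s3, i32 3
//
// which yields BuildVectorOpds = {%s0, %s1, %s2, %s3} and InsertElts = the
// four insertelement instructions, a list tryToVectorizeList() can bundle.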
/// Try to get a reduction instruction from a phi node.
static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {
  // There are situations where the reduction value is not dominated by the
  // reduction phi. Vectorizing such cases has been reported to cause
  // miscompiles. See PR25787.
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBLoop = LI->getLoopFor(ParentBB);
  if (!BBLoop)
    return nullptr;
  BasicBlock *BBLatch = BBLoop->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch, return the incoming value if it comes from that.
  // This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}
/// We could have an initial reduction that is not an add, e.g.
///   r *= v1 + v2 + v3 + v4
/// In such a case start looking for a tree rooted in the first '+'.
static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
                                                 Instruction *Root) {
  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
          isa<IntrinsicInst>(Root)) &&
         "Expected binop, select, or intrinsic for reduction matching");
  Value *LHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
  Value *RHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  if (LHS == Phi)
    return dyn_cast<Instruction>(RHS);
  if (RHS == Phi)
    return dyn_cast<Instruction>(LHS);
  return nullptr;
}

/// Returns the first operand of \p I that does not match \p Phi. If the
/// operand is not an instruction it returns nullptr.
static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
  if (!matchRdxBop(I, Op0, Op1))
    return nullptr;
  return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
}

/// \Returns true if \p I is a candidate instruction for reduction
/// vectorization.
static bool isReductionCandidate(Instruction *I) {
  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
  Value *B0 = nullptr, *B1 = nullptr;
  bool IsBinop = matchRdxBop(I, B0, B1);
  return IsBinop || IsSelect;
}
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    TargetTransformInfo *TTI,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  if (!ShouldVectorizeHor)
    return false;
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);

  if (Root->getParent() != BB || isa<PHINode>(Root))
    return false;

  // If we can find a secondary reduction root, use that instead.
  auto SelectRoot = [&]() {
    if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
        HorizontalReduction::getRdxKind(Root) != RecurKind::None)
      if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
        return NewRoot;
    return Root;
  };

  // Start analysis from the Root instruction. If a horizontal reduction is
  // found, try to vectorize it. If it is not a horizontal reduction, or
  // vectorization is not possible or not effective, and the currently
  // analyzed instruction is a binary operation, try to vectorize the
  // operands using pre-order DFS traversal order.
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    if (!isReductionCandidate(Inst))
      return nullptr;
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      FutureSeed = getNonPhiOperand(Root, P);
      if (!FutureSeed)
        return false;
    }
    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
    // analysis is done separately.
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
      PostponedInsts.push_back(FutureSeed);
    return true;
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // No need for extra analysis.
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        // Try to find another reduction.
        Stack.emplace(I, Level);
        continue;
      }
    } else {
      // We could not vectorize `Inst`, so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        assert(Stack.empty() && "Expected empty stack");
        break;
      }
    }

    // Try to vectorize operands. Continue analysis only for instructions from
    // the same basic block, to save compile time.
    if (++Level < RecursionMaxDepth)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Op).second)
          if (auto *I = dyn_cast<Instruction>(Op))
            // Do not try to vectorize CmpInst operands, this is done
            // separately.
            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
                !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(I, Level);
  }
  return Res;
}
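
// Illustrative sketch (not LLVM code): the loop above is a depth-bounded
// breadth-first walk seeded at the (possibly re-rooted) reduction root. Each
// popped node is first tried as a reduction; only on failure are its
// same-block operands enqueued, up to RecursionMaxDepth. A generic model with
// hypothetical callbacks:
//
//   #include <functional>
//   #include <queue>
//   #include <utility>
//   #include <vector>
//
//   template <typename Node>
//   void boundedBFS(Node Root, unsigned MaxDepth,
//                   const std::function<bool(Node)> &TryReduce,
//                   const std::function<std::vector<Node>(Node)> &Operands) {
//     std::queue<std::pair<Node, unsigned>> Work;
//     Work.emplace(Root, 0u);
//     while (!Work.empty()) {
//       auto [N, Level] = Work.front();
//       Work.pop();
//       if (TryReduce(N)) // matched and vectorized a reduction
//         continue;
//       if (++Level < MaxDepth)
//         for (Node Op : Operands(N)) // otherwise descend into the operands
//           Work.emplace(Op, Level);
//     }
//   }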
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 TargetTransformInfo *TTI) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in vector register.
  return tryToVectorizeList(BuildVectorOpds, R);
}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
      (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask)))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R);
}
/// Tries to vectorize a sorted sequence of same-type values, grouping maximal
/// runs of mutually compatible elements and trying each run as a bundle.
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(T *, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
      ++SameTypeIt;

    // Try to vectorize them.
    unsigned NumElts = (SameTypeIt - IncIt);
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    if (NumElts > 1 &&
        TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
      // Success: start over because instructions might have been changed.
      Changed = true;
    } else {
      /// \Returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType()))
        Candidates.append(IncIt, std::next(IncIt, NumElts));
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success: start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        for (auto *It = Candidates.begin(), *End = Candidates.end();
             It != End;) {
          auto *SameTypeIt = It;
          while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
            ++SameTypeIt;
          unsigned NumElts = (SameTypeIt - It);
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
                                                  /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
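
// Illustrative sketch (not LLVM code): the helper above is the classic
// "stable-sort by a comparator, then sweep maximal runs of mutually
// compatible elements" pattern. A minimal standalone model:
//
//   #include <algorithm>
//   #include <cstddef>
//   #include <vector>
//
//   template <typename T, typename Less, typename Compat, typename TryVec>
//   bool vectorizeRuns(std::vector<T> &Items, Less Cmp, Compat AreCompatible,
//                      TryVec TryToVectorize) {
//     bool Changed = false;
//     std::stable_sort(Items.begin(), Items.end(), Cmp);
//     for (size_t I = 0; I < Items.size();) {
//       size_t J = I + 1;
//       while (J < Items.size() && AreCompatible(Items[I], Items[J]))
//         ++J; // extend the run of compatible elements
//       if (J - I > 1)
//         Changed |= TryToVectorize(&Items[I], J - I); // try the whole run
//       I = J;
//     }
//     return Changed;
//   }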
/// Compare two cmp instructions. If IsCompatibility is true, the function
/// returns true if the 2 cmps are compatible (have an equal predicate and the
/// same first operand types); otherwise it implements a strict weak ordering.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with the same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
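
// Example (hypothetical): used with IsCompatibility == false this is a strict
// weak order, so predicates are canonicalized through getSwappedPredicate
// first; "%c1 = icmp sgt i32 %a, %b" and "%c2 = icmp slt i32 %b, %a" share
// the same base predicate and then compare equal operand-by-operand. The
// stable sort therefore places such commuted compares next to each other, and
// the IsCompatibility == true instantiation can admit them into one bundle.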
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op))
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize the list of compares.
  // Sort by type, compare predicate, etc.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, *TLI, *DT);
  };

  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    return compareCmp<true>(V1, V2, *TLI, *DT);
  };

  SmallVector<Value *> Vals;
  for (CmpInst *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(V->getType()))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  // pass1 - try to vectorize reductions only
  for (auto *I : reverse(Instructions)) {
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
  }
  // pass2 - try to match and vectorize a buildvector sequence.
  for (auto *I : reverse(Instructions)) {
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
    }
  }
  // Now try to vectorize the postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers; we don't care about other types.
    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
      return true;
    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S.getOpcode() && !S.isAltShuffle())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
        if (C1 && C2)
          continue;
        if (C1)
          return true;
        if (C2)
          return false;
      }
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      {
        // Non-undef values come next.
        if (U1 && !U2)
          return false;
        if (!U1 && U2)
          return true;
      }
      {
        // Compare by the value IDs.
        auto ValID1 = Opcodes1[I]->getValueID();
        auto ValID2 = Opcodes2[I]->getValueID();
        if (ValID1 == ValID2)
          continue;
        if (ValID1 < ValID2)
          return true;
        if (ValID1 > ValID2)
          return false;
      }
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (I1->getParent() != I2->getParent())
            return false;
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          return S.getOpcode() > 0;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P)
        break;

      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    // Find the corresponding non-phi nodes for better matching when trying to
    // build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();

  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in PostProcessInserts and, if VectorizeCmps is true,
  // also vectorizes PostProcessCmps.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if the instruction `I` has no users, and its type is void or
  // it is a call/invoke, i.e. nothing else in the block depends on its value.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable vector type.
    if (isa<ScalableVectorType>(It->getType()))
      continue;

    // Skip instructions marked for deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times, so skip the ones we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    if (isa<DbgInfoIntrinsic>(It))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI))
          Changed |= vectorizeRootInstruction(nullptr, PI,
                                              P->getIncomingBlock(I), R, TTI);
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize the chain in the store if this is the only store
        // to the address in the block.
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay their
          // vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
        }
      }
      // Start vectorization of the post-process list of instructions from the
      // top-tree instructions, to try to vectorize as many instructions as
      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
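
// Illustrative sketch (not LLVM code): the restart idiom used above, which
// resets the iterator after a successful rewrite because deletions may
// invalidate it, in standalone form. It terminates because, like the
// VisitedInstrs set above, TryRewrite must eventually return false for every
// element:
//
//   #include <list>
//
//   template <typename T, typename TryRewrite>
//   void scanWithRestart(std::list<T> &Block, TryRewrite Try) {
//     auto It = Block.begin();
//     while (It != Block.end()) {
//       if (Try(*It)) {
//         It = Block.begin(); // start over; old iterators may be stale
//         continue;
//       }
//       ++It;
//     }
//   }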
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. If a vector register can't hold 1 element, we are done. We
    // are trying to vectorize the index computations, so the maximum number
    // of elements is based on the size of the index expression, rather than
    // the size of the GEP itself (the target's pointer size).
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them, or their index was optimized to a constant.
      // If so, they are marked as deleted, so remove them from the set of
      // candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase, since one can be
      // computed from the other.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1;
           ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        auto *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          auto *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      // We ensured the indices met these constraints when we originally
      // collected the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try to vectorize the indices.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
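
// Illustrative IR (hypothetical values): the bundle assembled above holds the
// non-constant *index* operands of otherwise-scalar GEPs, e.g. for
//
//   %g0 = getelementptr inbounds float, ptr %base, i64 %i0
//   %g1 = getelementptr inbounds float, ptr %base, i64 %i1
//
// the bundle is {%i0, %i1}. If the index computations themselves form a
// vectorizable tree (say %i0 = sub i64 %a0, %b0 and %i1 = sub i64 %a1, %b1),
// SLP can emit one <2 x i64> sub and extract the lanes for the two GEPs.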
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and values operand. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    // UndefValues are compatible with all other values.
    if (isa<UndefValue>(V->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        if (S.getOpcode())
          return false;
        return I1->getOpcode() < I2->getOpcode();
      }
    if (isa<Constant>(V->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return false;
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        return S.getOpcode() > 0;
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse stores to do bottom-to-top analysis. This is important if the
    // values are stored to the same addresses several times, in which case
    // the last store at the top of the list is the actual stored value.
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
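
// Illustrative IR (hypothetical values): one entry of the `Stores` map, keyed
// by the underlying object, might contain
//
//   store i32 %v0, ptr %p
//   %p1 = getelementptr inbounds i32, ptr %p, i64 1
//   store i32 %v1, ptr %p1
//
// After the sorting and compatibility grouping above, vectorizeStores() can
// replace the pair with a single `store <2 x i32>` whenever %v0 and %v1 form
// a vectorizable tree.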
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Loop::LoopBounds::Direction Direction
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static Value * isOneOf(const InstructionsState &S, Value *Op)
Chooses the correct key for scheduling data.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static std::string shortBundleName(ArrayRef< Value * > VL)
Print a short descriptor of the instruction bundle suitable for debug output.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static std::optional< unsigned > getInsertIndex(const Value *InsertInst, unsigned Offset=0)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
This defines the Use class.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleCostEstimator(TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
value_type & FindAndConstruct(const KeyT &Key)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
const BasicBlock * getParent() const
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
size_type count(const KeyT &Key) const
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
void reserve(size_type NumEntries)
Grow the MapVector so that it can contain at least NumEntries items before resizing again.
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
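A small sketch of walking a phi's incoming edges with the accessors above; the visitEdge callback is a hypothetical stand-in for real processing:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Visit each (predecessor block, incoming value) pair of a phi.
void visitPhiEdges(PHINode &Phi,
                   function_ref<void(BasicBlock *, Value *)> visitEdge) {
  for (unsigned I = 0, E = Phi.getNumIncomingValues(); I != E; ++I) {
    BasicBlock *Pred = Phi.getIncomingBlock(I);
    visitEdge(Pred, Phi.getIncomingValueForBlock(Pred));
  }
}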
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T get() const
Returns the value of the specified pointer type.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all analyses.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
void clear()
Completely clear the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
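A sketch of the usual worklist idiom built on these semantics; names are illustrative:

#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// insert() returns true only for the first occurrence, so the worklist
// filters duplicates while preserving discovery order for determinism.
void enqueue(Instruction *I, SetVector<Instruction *> &Worklist) {
  if (Worklist.insert(I)) {
    // Newly discovered: follow operands, record statistics, etc.
  }
}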
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
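A quick sketch classifying hand-written masks with the static predicates above; the concrete masks are illustrative:

#include "llvm/IR/Instructions.h"
#include <cassert>
using namespace llvm;

void classifyMasks() {
  int Identity[] = {0, 1, 2, 3}; // picks each lane from source 0 in order
  assert(ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4));
  int Reverse[] = {3, 2, 1, 0};  // single source, lanes reversed
  assert(ShuffleVectorInst::isReverseMask(Reverse, /*NumSrcElts=*/4));
  int Splat[] = {0, 0, 0, 0};    // every lane copies element 0
  assert(ShuffleVectorInst::isZeroEltSplatMask(Splat, /*NumSrcElts=*/4));
}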
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
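The find_first/find_next pair supports the standard set-bit iteration idiom; a sketch:

#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;

// Visit every set bit; both queries return -1 once the bits run out.
unsigned countSetLanes(const SmallBitVector &Lanes) {
  unsigned N = 0;
  for (int I = Lanes.find_first(); I != -1; I = Lanes.find_next(I))
    ++N;
  return N; // same result as Lanes.count(); shown only for the idiom
}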
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
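A sketch combining these queries: getScalarType() makes scalar and vector inputs uniform before asking for the integer width.

#include "llvm/IR/Type.h"
#include <cassert>
using namespace llvm;

// Bit width of the element type behind a scalar-or-vector integer type.
unsigned scalarBitWidth(Type *Ty) {
  assert(Ty->isIntOrIntVectorTy() && "expected (vector of) integers");
  return Ty->getScalarType()->getIntegerBitWidth();
}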
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Value * getOperand(unsigned i) const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
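A sketch of the cheap use-list queries; hasOneUse() is constant-time, unlike getNumUses(), which walks the whole list:

#include "llvm/IR/Instructions.h"
using namespace llvm;

// True when V has exactly one user and that user is a store.
bool feedsSingleStore(Value *V) {
  return V->hasOneUse() && isa<StoreInst>(*V->user_begin());
}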
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
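A sketch of the factory; the fixed element count of 4 is illustrative:

#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// Build the <4 x float> type via the ElementCount-based factory.
VectorType *getV4F32(LLVMContext &Ctx) {
  return VectorType::get(Type::getFloatTy(Ctx), ElementCount::getFixed(4));
}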
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state in which the loads in the given sequence can be represented.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
unsigned getTreeSize() const
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index of the pair with the highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignores it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
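A sketch of composing these matchers; matchShlOr is a hypothetical helper, and note that m_Or as written matches the operands in the given order only:

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Recognize (X << C) | Y, binding the pieces on success.
bool matchShlOr(Value *V, Value *&X, Value *&Y, const APInt *&ShAmt) {
  return match(V, m_Or(m_Shl(m_Value(X), m_APInt(ShAmt)), m_Value(Y)));
}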
@ Undef
Value of the register doesn't matter.
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
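These wrappers drop the begin()/end() boilerplate; a trivial sketch:

#include "llvm/ADT/STLExtras.h"
#include <vector>

// Range form of std::all_of over an arbitrary container.
bool allEven(const std::vector<int> &Xs) {
  return llvm::all_of(Xs, [](int X) { return X % 2 == 0; });
}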
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
testing::Matcher< const detail::ErrorHolder & > Failed()
bool getAlign(const Function &F, unsigned index, unsigned &align)
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
iterator_range< po_iterator< T > > post_order(const T &G)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
auto reverse(ContainerTy &&C)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
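A sketch of typical vector-factor arithmetic with these helpers; the sample numbers are illustrative:

#include "llvm/Support/MathExtras.h"
#include <cstdint>

void vfMath() {
  uint64_t VF = llvm::PowerOf2Ceil(6);      // 8: round a lane count up
  uint64_t Chunks = llvm::divideCeil(6, 4); // 2: ceil(6 / 4)
  bool Pow2 = llvm::isPowerOf2_32(8);       // true
  unsigned Lg = llvm::Log2_32(8);           // 3
  (void)VF; (void)Chunks; (void)Pow2; (void)Lg;
}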
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
constexpr int PoisonMaskElem
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
DWARFExpression::Operation Op
auto max_element(R &&Range)
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t that is less than or equal to Value and is Skew mod Align.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
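A sketch of hashing a composite key; hashKey is a hypothetical helper:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/IR/Value.h"
using namespace llvm;

// Fold an opcode and a list of pointers into one hash_code.
hash_code hashKey(unsigned Opcode, ArrayRef<Value *> VL) {
  return hash_combine(Opcode, hash_combine_range(VL.begin(), VL.end()));
}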
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are...
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const