#ifdef EXPENSIVE_CHECKS
using namespace slpvectorizer;
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
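// Reader's note for this excerpt: what follows are the pass statistic and the
// cl::desc() strings of the SLP vectorizer's command-line tuning options
// (cost threshold, horizontal-reduction handling, register-size and
// vectorization-factor limits, scheduling-region size, recursion depth,
// look-ahead depths, strided-load heuristics, and debug dumping). The cl::opt
// declarations themselves are elided here.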
STATISTIC(NumVectorInstructions,
          "Number of vector instructions generated");
    cl::desc("Run the SLP vectorization passes"));
    cl::desc("Only vectorize if you gain more than this "
    cl::desc(
        "When true, SLP vectorizer bypasses profitability checks based on "
        "heuristics and makes vectorization decision via cost modeling."));
    cl::desc("Attempt to vectorize horizontal reductions"));
        "Attempt to vectorize horizontal reductions feeding into a store"));
    cl::desc(
        "Allow optimization of original scalar identity operations on "
        "matched horizontal reductions."));
    cl::desc("Attempt to vectorize for this register size in bits"));
    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
    cl::desc("Limit the size of the SLP scheduling region per block"));
    cl::desc("Attempt to vectorize for this register size in bits"));
    cl::desc("Limit the recursion depth when building a vectorizable tree"));
    cl::desc("Only vectorize small trees if they are fully vectorizable"));
    cl::desc("The maximum look-ahead depth for operand reordering scores"));
    cl::desc("The maximum look-ahead depth for searching best rooting option"));
    cl::desc(
        "The minimum number of loads, which should be considered strided, "
        "if the stride is > 1 or is runtime value"));
    cl::desc("The maximum stride, considered to be profitable."));
    cl::desc("Display the SLP trees with Graphviz"));
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
  if (isa<ExtractElementInst>(I))
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
              ICmpInst::Predicate Pred;
              if (match(U.getUser(),
                        m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                  (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::abs>(
                               m_Specific(U.get()), m_ConstantInt(Flag))) &&
                     (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
           (BO->getOpcode() == Instruction::FSub &&
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();
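// Note on the fragment above: integer/FP subtraction is treated as
// "commutative" when every user of the result is insensitive to a sign flip,
// e.g. an eq/ne compare against zero or an abs/fabs intrinsic call, so the
// operands of such a sub/fsub may be swapped during operand reordering.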
  if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
  const auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
  if (MaskArg == UseMask::UndefsAsMask)
  if (MaskArg == UseMask::FirstArg && Value < VF)
    UseMask.reset(Value);
  else if (MaskArg == UseMask::SecondArg && Value >= VF)
    UseMask.reset(Value - VF);
template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  auto *C = dyn_cast<Constant>(V);
  if (!UseMask.empty()) {
    while (auto *II = dyn_cast<InsertElementInst>(Base)) {
      Base = II->getOperand(0);
      if (isa<T>(II->getOperand(1)))
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))
    Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
        (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
static std::optional<TargetTransformInfo::ShuffleKind>
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  auto *EI0 = cast<ExtractElementInst>(*It);
  if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
      cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    if (isa<UndefValue>(VL[I]))
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
    auto *Vec = EI->getVectorOperand();
    if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
    if (isa<UndefValue>(EI->getIndexOperand()))
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    return CI->getZExtValue();
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
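// The helper above classifies a bundle of extractelement instructions as a
// shuffle of at most two fixed-width source vectors: constant indices drawn
// from a single vector give a select/identity shuffle, anything else degrades
// to a two-source permute, and scalable vectors or non-constant indices make
// the classification fail.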
struct InstructionsState {
  Value *OpValue = nullptr;
  unsigned getAltOpcode() const {
  bool isAltShuffle() const { return AltOp != MainOp; }
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  InstructionsState() = delete;
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))
                                       unsigned BaseIndex = 0);
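// InstructionsState (above) records the "main" and "alternate" opcodes chosen
// for a bundle of scalars; isAltShuffle() is true when the bundle mixes both
// opcodes and will be emitted as two vector instructions blended by a
// shuffle. A state with null MainOp/AltOp means no common opcode was found.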
          (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
           !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
          BaseOp0 == Op0 || BaseOp1 == Op1 ||
         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&
                                       unsigned BaseIndex) {
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;
  bool SwappedPredsCompatible = [&]() {
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  auto *IBase = cast<Instruction>(VL[BaseIndex]);
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = cast<Instruction>(VL[Cnt]);
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        AltOpcode = InstOpcode;
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        if ((E == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex != BaseIndex) {
        } else if (BasePred != CurrentPred) {
                 "CmpInst isn't safe for alternation, logic needs to be updated!");
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (Mappings.size() != BaseMappings.size() ||
            Mappings.front().ISA != BaseMappings.front().ISA ||
            Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
            Mappings.front().VectorName != BaseMappings.front().VectorName ||
            Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
            Mappings.front().Shape.Parameters !=
                BaseMappings.front().Shape.Parameters)
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
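// getSameOpcode (fragments above) scans the bundle once and tries to settle
// on an Opcode/AltOpcode pair: binary ops and casts may alternate, compares
// may use a swapped predicate, while GEPs, loads, extracts and calls must
// match structurally (operand count, simple-ness, intrinsic/vector-function
// mappings). Any mismatch returns the null InstructionsState.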
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
             Arg.value().get() == Scalar;
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
    return SI->isSimple();
  return !MI->isVolatile();
                    bool ExtendingManyInputs = false) {
         (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
          (SubMask.size() == Mask.size() &&
           std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
                       [](int Idx) { return Idx == PoisonMaskElem; }))) &&
         "SubMask with many inputs support must be larger than the mask.");
    Mask.append(SubMask.begin(), SubMask.end());
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
    NewMask[I] = Mask[SubMask[I]];
  const unsigned Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");
  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Scalars[Mask[I]] = Prev[I];
  auto *I = dyn_cast<Instruction>(V);
    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();
  auto *I = dyn_cast<Instruction>(V);
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
           auto *IU = dyn_cast<Instruction>(U);
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
  return !VL.empty() &&
namespace slpvectorizer {
  struct ScheduleData;
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MultiNodeScalars.clear();
    EntryToLastInstruction.clear();
    ExternalUses.clear();
    ExternalUsesAsGEPs.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    return MaxVecRegSize;
    return MinVecRegSize;
    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
    return MaxVF ? MaxVF : UINT_MAX;
                         bool TryRecursiveCheck = true) const;
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}
    if (isa<LoadInst>(V1)) {
      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        auto AllUsersVectorized = [U1, U2, this](Value *V) {
            return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))
      auto *LI1 = dyn_cast<LoadInst>(V1);
      auto *LI2 = dyn_cast<LoadInst>(V2);
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, true);
        if (!Dist || *Dist == 0) {
              R.TTI->isLegalMaskedGather(
        if (std::abs(*Dist) > NumLanes / 2)
    auto *C1 = dyn_cast<Constant>(V1);
    auto *C2 = dyn_cast<Constant>(V2);
      if (isa<UndefValue>(V2))
      Value *EV2 = nullptr;
        int Dist = Idx2 - Idx1;
        if (std::abs(Dist) == 0)
        if (std::abs(Dist) > NumLanes / 2)
    auto *I1 = dyn_cast<Instruction>(V1);
    auto *I2 = dyn_cast<Instruction>(V2);
      if (I1->getParent() != I2->getParent())
      if (S.getOpcode() &&
          (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
           !S.isAltShuffle()) &&
            return cast<Instruction>(V)->getNumOperands() ==
                   S.MainOp->getNumOperands();
    if (isa<UndefValue>(V2))
    int ShallowScoreAtThisLevel =
    auto *I1 = dyn_cast<Instruction>(LHS);
    auto *I2 = dyn_cast<Instruction>(RHS);
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
          (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
          (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
         ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");
    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
              ? I2->getNumOperands()
              : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        if (Op2Used.count(OpIdx2))
            I1, I2, CurrLevel + 1, std::nullopt);
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
    return ShallowScoreAtThisLevel;
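// Look-ahead scoring (above): a pair of candidate operands first gets a
// shallow score (consecutive loads, matching constants, same opcode, splats),
// and the recursive part then scores the operands of the two instructions up
// to MaxLevel, keeping the best pairing of operand indices at each level and
// accumulating it into ShallowScoreAtThisLevel.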
  struct OperandData {
    OperandData() = default;
    OperandData(Value *V, bool APO, bool IsUsed)
        : V(V), APO(APO), IsUsed(IsUsed) {}
    bool IsUsed = false;
  enum class ReorderingMode {
  OperandData &getData(unsigned OpIdx, unsigned Lane) {
    return OpsVec[OpIdx][Lane];
  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
    return OpsVec[OpIdx][Lane];
    for (unsigned OpIdx = 0, NumOperands = getNumOperands();
         OpIdx != NumOperands; ++OpIdx)
      for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
        OpsVec[OpIdx][Lane].IsUsed = false;
  void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
    std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
  int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
    for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
      Value *OpIdxLnV = getData(OpIdx, Ln).V;
      if (!isa<Instruction>(OpIdxLnV))
      Uniques.insert(OpIdxLnV);
    int UniquesCount = Uniques.size();
    int UniquesCntWithIdxLaneV =
        Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
    Value *OpIdxLaneV = getData(OpIdx, Lane).V;
    int UniquesCntWithOpIdxLaneV =
        Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
    if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
                UniquesCntWithOpIdxLaneV) -
           (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
  int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    Value *OpIdxLaneV = getData(OpIdx, Lane).V;
    auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
    if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
    return R.areAllUsersVectorized(IdxLaneI)
  static const int ScoreScaleFactor = 10;
                        int Lane, unsigned OpIdx, unsigned Idx,
      int SplatScore = getSplatScore(Lane, OpIdx, Idx);
      if (Score <= -SplatScore) {
        Score += SplatScore;
      Score *= ScoreScaleFactor;
      Score += getExternalUseScore(Lane, OpIdx, Idx);
  std::optional<unsigned>
  getBestOperand(unsigned OpIdx, int Lane, int LastLane,
    unsigned NumOperands = getNumOperands();
    Value *OpLastLane = getData(OpIdx, LastLane).V;
    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;
    bool OpIdxAPO = getData(OpIdx, Lane).APO;
    std::optional<unsigned> Idx;
        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
        RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
      OperandData &OpData = getData(Idx, Lane);
      bool OpAPO = OpData.APO;
      if (OpAPO != OpIdxAPO)
      case ReorderingMode::Load:
      case ReorderingMode::Constant:
      case ReorderingMode::Opcode: {
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed);
        if (Score > static_cast<int>(BestOp.Score)) {
          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
      case ReorderingMode::Splat:
        if (Op == OpLastLane)
      case ReorderingMode::Failed:
      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
    return std::nullopt;
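// getBestOperand (above): for the current lane it searches all operand slots
// with a compatible APO flag and picks the one whose value best continues the
// operand chosen in the previous lane, either by look-ahead score
// (Load/Constant/Opcode modes) or by exact repetition (Splat mode); no
// candidate means std::nullopt and the mode is later marked Failed.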
  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;
    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);
      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        auto *It = HashMap.find(NumFreeOpsHash.Hash);
        if (It == HashMap.end())
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;
      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;
  struct OperandsOrderData {
    unsigned NumOfAPOs = UINT_MAX;
    unsigned NumOpsWithSameOpcodeParent = 0;
  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
    unsigned CntTrue = 0;
    unsigned NumOperands = getNumOperands();
    bool AllUndefs = true;
    unsigned NumOpsWithSameOpcodeParent = 0;
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      const OperandData &OpData = getData(OpIdx, Lane);
      if (auto *I = dyn_cast<Instruction>(OpData.V)) {
            I->getParent() != Parent) {
          if (NumOpsWithSameOpcodeParent == 0) {
            NumOpsWithSameOpcodeParent = 1;
          Parent = I->getParent();
          --NumOpsWithSameOpcodeParent;
          ++NumOpsWithSameOpcodeParent;
          Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
      AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
    OperandsOrderData Data;
    Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
    Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
    assert((empty() || VL.size() == getNumLanes()) &&
           "Expected same number of lanes");
    assert(isa<Instruction>(VL[0]) && "Expected instruction");
    unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
    constexpr unsigned IntrinsicNumOperands = 2;
    if (isa<IntrinsicInst>(VL[0]))
      NumOperands = IntrinsicNumOperands;
    OpsVec.resize(NumOperands);
    unsigned NumLanes = VL.size();
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      OpsVec[OpIdx].resize(NumLanes);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
        bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
        bool APO = (OpIdx == 0) ? false : IsInverseOperation;
        OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
  unsigned getNumOperands() const { return OpsVec.size(); }
  unsigned getNumLanes() const { return OpsVec[0].size(); }
  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;
  bool empty() const { return OpsVec.empty(); }
  void clear() { OpsVec.clear(); }
  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
    bool OpAPO = getData(OpIdx, Lane).APO;
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
          FoundCandidate = true;
        if (!FoundCandidate)
      : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R) {
    appendOperandsOfVL(RootVL);
    assert(OpsVec[OpIdx].size() == getNumLanes() &&
           "Expected same num of lanes across all operands");
    for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
      OpVL[Lane] = OpsVec[OpIdx][Lane].V;
    unsigned NumOperands = getNumOperands();
    unsigned NumLanes = getNumLanes();
    unsigned FirstLane = getBestLaneToStartReordering();
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      Value *OpLane0 = getValue(OpIdx, FirstLane);
      if (isa<LoadInst>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Load;
      else if (isa<Instruction>(OpLane0)) {
        if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
          ReorderingModes[OpIdx] = ReorderingMode::Opcode;
      } else if (isa<Constant>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Constant;
      else if (isa<Argument>(OpLane0))
        ReorderingModes[OpIdx] = ReorderingMode::Splat;
        ReorderingModes[OpIdx] = ReorderingMode::Failed;
    auto &&SkipReordering = [this]() {
      for (const OperandData &Data : Op0)
        if (any_of(Op, [&UniqueValues](const OperandData &Data) {
    if (SkipReordering())
    bool StrategyFailed = false;
    for (unsigned I = 0; I < NumOperands; ++I)
      MainAltOps[I].push_back(getData(I, FirstLane).V);
    for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
      int Lane = FirstLane + Direction * Distance;
      if (Lane < 0 || Lane >= (int)NumLanes)
      assert(LastLane >= 0 && LastLane < (int)NumLanes &&
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        std::optional<unsigned> BestIdx = getBestOperand(
            OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
          swap(OpIdx, *BestIdx, Lane);
          ReorderingModes[OpIdx] = ReorderingMode::Failed;
          StrategyFailed = true;
        if (MainAltOps[OpIdx].size() != 2) {
          OperandData &AltOp = getData(OpIdx, Lane);
          InstructionsState OpS =
          if (OpS.getOpcode() && OpS.isAltShuffle())
    if (!StrategyFailed)
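// reorder() strategy visible above: pick the most constrained lane to start
// from, assign a ReorderingMode per operand position from that lane's values,
// then walk the remaining lanes outward in both directions, greedily swapping
// operands into the column that best matches the previous lane; a lane with
// no acceptable candidate marks the mode (and the overall strategy) as
// failed.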
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    case ReorderingMode::Load:
    case ReorderingMode::Opcode:
    case ReorderingMode::Constant:
    case ReorderingMode::Splat:
    case ReorderingMode::Failed:
    const unsigned Indent = 2;
      OS << "Operand " << Cnt++ << "\n";
      for (const OperandData &OpData : OpDataVec) {
        if (Value *V = OpData.V)
        OS << ", APO:" << OpData.APO << "}\n";
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
                                               Candidates[I].second,
      if (Score > BestScore) {
    DeletedInstructions.insert(I);
    return AnalyzedReductionsRoots.count(I);
    AnalyzedReductionsRoots.insert(I);
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
  bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
                             unsigned &MaxDepthLevel,
                             bool &IsProfitableToDemote,
                             bool IsTruncRoot) const;
  canReorderOperands(TreeEntry *UserTE,
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    TreeEntry *TE = nullptr;
      TE = getTreeEntry(V);
      if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
      auto It = MultiNodeScalars.find(V);
      if (It != MultiNodeScalars.end()) {
        for (TreeEntry *E : It->second) {
          if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
    if (It != VL.end()) {
      assert(TE->isSame(VL) && "Expected same scalars.");
  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);
  bool areAllUsersVectorized(
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
  getCastContextHint(const TreeEntry &TE) const;
                       const EdgeInfo &EI);
                      bool ResizeAllowed = false) const;
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Args &...Params);
  Value *createBuildVector(const TreeEntry *E);
  Instruction &getLastInstructionInBundle(const TreeEntry *E);
  std::optional<TargetTransformInfo::ShuffleKind>
                                          unsigned NumParts) const;
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
  isGatherShuffledEntry(
      unsigned NumParts, bool ForOrder = false);
  void setInsertPointAfterBundle(const TreeEntry *E);
  bool isFullyVectorizableTinyTree(bool ForReduction) const;
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
    TreeEntry(VecTreeTy &Container) : Container(Container) {}
                      [Scalars](Value *V, int Idx) {
                        return (isa<UndefValue>(V) &&
                                Idx == PoisonMaskElem) ||
                               (Idx != PoisonMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          return IsSame(Scalars, Mask);
        return IsSame(Scalars, ReuseShuffleIndices);
    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return State == TreeEntry::NeedToGather &&
             UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
        if (PrevCount == Used.count())
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    VecTreeTy &Container;
      assert(Operands[OpIdx].empty() && "Already resized?");
             "Number of operands is greater than the number of scalars.");
    void setOperandsInOrder() {
      auto *I0 = cast<Instruction>(Scalars[0]);
      Operands.resize(I0->getNumOperands());
      unsigned NumLanes = Scalars.size();
      for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
           OpIdx != NumOperands; ++OpIdx) {
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          auto *I = cast<Instruction>(Scalars[Lane]);
          assert(I->getNumOperands() == NumOperands &&
                 "Expected same number of operands");
          Operands[OpIdx][Lane] = I->getOperand(OpIdx);
    unsigned getNumOperands() const { return Operands.size(); }
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(!Operands[OpIdx].empty() && "No operand available");
    bool isAltShuffle() const { return MainOp != AltOp; }
      unsigned CheckedOpcode = I->getOpcode();
      return (getOpcode() == CheckedOpcode ||
              getAltOpcode() == CheckedOpcode);
      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))
    void setOperations(const InstructionsState &S) {
    unsigned getOpcode() const {
      return MainOp ? MainOp->getOpcode() : 0;
    unsigned getAltOpcode() const {
    int findLaneForValue(Value *V) const {
      unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReorderIndices.empty())
        FoundLane = ReorderIndices[FoundLane];
      assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
      if (!ReuseShuffleIndices.empty()) {
        FoundLane = std::distance(ReuseShuffleIndices.begin(),
                                  find(ReuseShuffleIndices, FoundLane));
    bool isNonPowOf2Vec() const {
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
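    // TreeEntry (above) is a single node of the SLP graph: the bundle of
    // scalars, the per-operand scalar lists, optional ReorderIndices and
    // ReuseShuffleIndices masks, the main/alternate opcode pair, and the
    // links back to user nodes via UserTreeIndices.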
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
        dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        dbgs() << "NeedToGather\n";
      dbgs() << "MainOp: ";
        dbgs() << *MainOp << "\n";
      dbgs() << "AltOp: ";
        dbgs() << *AltOp << "\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";
  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
    dbgs() << "SLP: " << Banner << ":\n";
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                        ReuseShuffleIndices, ReorderIndices);
                          TreeEntry::EntryState EntryState,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
           "Need to vectorize gather entry?");
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
      Last->Scalars.assign(VL.size(), nullptr);
        if (Idx >= VL.size())
          return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    if (Last->State != TreeEntry::NeedToGather) {
      for (Value *V : VL) {
        const TreeEntry *TE = getTreeEntry(V);
               "Scalar already in tree!");
          MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
        ScalarToTreeEntry[V] = Last;
      ScheduleData *BundleMember = *Bundle;
      assert((BundleMember || isa<PHINode>(S.MainOp) ||
             "Bundle and VL out of sync");
      for (Value *V : VL) {
        BundleMember->TE = Last;
        BundleMember = BundleMember->NextInBundle;
      assert(!BundleMember && "Bundle and VL out of sync");
      bool AllConstsOrCasts = true;
        auto *I = dyn_cast<CastInst>(V);
        AllConstsOrCasts &= I && I->getType()->isIntegerTy();
      if (AllConstsOrCasts)
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert(VL.begin(), VL.end());
    if (UserTreeIdx.UserTE) {
      Last->UserTreeIndices.push_back(UserTreeIdx);
      assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
             "Reordering isn't implemented for non-power-of-2 nodes yet");
  TreeEntry::VecTreeTy VectorizableTree;
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
  const TreeEntry *getTreeEntry(Value *V) const {
    return ScalarToTreeEntry.lookup(V);
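  // newTreeEntry (above) appends a node to VectorizableTree, copies the
  // reuse/reorder masks, registers each scalar in ScalarToTreeEntry (or in
  // MultiNodeScalars when a scalar already belongs to another node), and
  // wires the scheduling bundle members back to the new entry; gather nodes
  // instead land in MustGather.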
  bool areAltOperandsProfitable(const InstructionsState &S,
  TreeEntry::EntryState getScalarsVectorizationState(
  using ValueToGatherNodesMap =
  ValueToGatherNodesMap ValueToGatherNodes;
  struct ExternalUser {
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto It = AliasCache.find(Key);
    if (It != AliasCache.end())
    AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
  UserList ExternalUses;
  struct ScheduleData {
    enum { InvalidDeps = -1 };
    ScheduleData() = default;
    void init(int BlockSchedulingRegionID, Value *OpVal) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
        assert(UnscheduledDeps == Dependencies && "invariant");
        assert(isSchedulingEntity() &&
               "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");
      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
    bool isSchedulingEntity() const { return FirstInBundle == this; }
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();
    void resetUnscheduledDeps() {
      UnscheduledDeps = Dependencies;
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
        Sum += BundleMember->UnscheduledDeps;
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        ScheduleData *SD = NextInBundle;
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
    Value *OpValue = nullptr;
    TreeEntry *TE = nullptr;
    ScheduleData *FirstInBundle = nullptr;
    ScheduleData *NextInBundle = nullptr;
    ScheduleData *NextLoadStore = nullptr;
    int SchedulingRegionID = 0;
    int SchedulingPriority = 0;
    int Dependencies = InvalidDeps;
    int UnscheduledDeps = InvalidDeps;
    bool IsScheduled = false;
                                const BoUpSLP::ScheduleData &SD) {
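  // ScheduleData (above) is the per-instruction scheduling record: bundle
  // members are chained through NextInBundle with FirstInBundle acting as the
  // scheduling entity, and Dependencies/UnscheduledDeps count the outstanding
  // def, memory and control dependencies inside the current scheduling
  // region.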
  struct BlockScheduling {
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      ScheduleRegionSize = 0;
      ++SchedulingRegionID;
      if (BB != I->getParent())
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))
    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);
    ScheduleData *getScheduleData(Value *V, Value *Key) {
        return getScheduleData(V);
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end()) {
        ScheduleData *SD = I->second.lookup(Key);
        if (SD && isInSchedulingRegion(SD))
    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;
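    // schedule() below marks a bundle as scheduled and walks its members,
    // decrementing the unscheduled-dependency counters of their definition,
    // memory and control dependees; any dependee bundle whose counter drops
    // to zero is inserted into the ready list.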
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->Inst != BundleMember->OpValue)
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
            if (OpDef && OpDef->hasValidDependencies() &&
                OpDef->incrementUnscheduledDeps(-1) == 0) {
              ScheduleData *DepBundle = OpDef->FirstInBundle;
              assert(!DepBundle->IsScheduled &&
                     "already scheduled bundle gets ready");
              ReadyList.insert(DepBundle);
                         << "SLP: gets ready (def): " << *DepBundle << "\n");
        if (TreeEntry *TE = BundleMember->TE) {
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, BundleMember->Inst));
          assert(Lane >= 0 && "Lane not set");
          auto *In = BundleMember->Inst;
                 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
                  In->getNumOperands() == TE->getNumOperands()) &&
                 "Missed TreeEntry operands?");
          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
                       << "SLP: gets ready (mem): " << *DepBundle << "\n");
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
                       << "SLP: gets ready (ctl): " << *DepBundle << "\n");
      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
        doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");
    void doForAllOpcodes(Value *V,
      if (ScheduleData *SD = getScheduleData(V))
      auto I = ExtraScheduleDataMap.find(V);
      if (I != ExtraScheduleDataMap.end())
        for (auto &P : I->second)
          if (isInSchedulingRegion(P.second))
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        doForAllOpcodes(I, [&](ScheduleData *SD) {
          if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
            ReadyList.insert(SD);
                       << "SLP: initially in ready list: " << *SD << "\n");
    std::optional<ScheduleData *>
                     const InstructionsState &S);
    ScheduleData *allocateScheduleDataChunks();
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);
                             ScheduleData *PrevLoadStore,
                             ScheduleData *NextLoadStore);
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
    void resetSchedule();
        ExtraScheduleDataMap;
    ScheduleData *FirstLoadStoreInRegion = nullptr;
    ScheduleData *LastLoadStoreInRegion = nullptr;
    bool RegionHasStackSave = false;
    int ScheduleRegionSize = 0;
    int SchedulingRegionID = 1;
  void scheduleBlock(BlockScheduling *BS);
  struct OrdersTypeDenseMapInfo {
    static unsigned getHashValue(const OrdersType &V) {
  unsigned MaxVecRegSize;
  unsigned MinVecRegSize;
  unsigned ReductionBitWidth = 0;
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
  struct ChildIteratorType
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    return R.VectorizableTree[0].get();
    return {N->UserTreeIndices.begin(), N->Container};
    return {N->UserTreeIndices.end(), N->Container};
  class nodes_iterator {
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
    return nodes_iterator(R->VectorizableTree.begin());
    return nodes_iterator(R->VectorizableTree.end());
  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
    OS << Entry->Idx << ".\n";
    for (auto *V : Entry->Scalars) {
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
    if (Entry->State == TreeEntry::NeedToGather)
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize)
      return "color=blue";
  for (auto *I : DeletedInstructions) {
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
    I->dropAllReferences();
  for (auto *I : DeletedInstructions) {
           "trying to erase instruction with users.");
    I->eraseFromParent();
#ifdef EXPENSIVE_CHECKS
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Reuses[Mask[I]] = Prev[I];
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (Order.empty()) {
    std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    PrevOrder.swap(Order);
  for (unsigned I = 0; I < Sz; ++I)
      Order[I] = PrevOrder[Mask[I]];
        return Data.value() == Sz || Data.index() == Data.value();
  if (Order.empty()) {
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  for (unsigned I = 0; I < Sz; ++I)
      Order[MaskOrder[I]] = I;
std::optional<BoUpSLP::OrdersType>
  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
  Type *ScalarTy = GatheredScalars.front()->getType();
  int NumScalars = GatheredScalars.size();
    return std::nullopt;
  if (NumParts == 0 || NumParts >= NumScalars)
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      Entries.front().front()->isSame(TE.Scalars)) {
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
    return all_of(Mask, [&](int I) {
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
      const int VF = GetVF(I);
      if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
      int FirstMin = INT_MAX;
      int SecondVecFound = false;
      for (int K : seq<int>(0, PartSz)) {
        int Idx = Mask[I * PartSz + K];
          Value *V = GatheredScalars[I * PartSz + K];
            SecondVecFound = true;
            SecondVecFound = true;
      FirstMin = (FirstMin / PartSz) * PartSz;
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
      for (int K : seq<int>(0, PartSz)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx >= PartSz) {
          SecondVecFound = true;
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
  int PartSz = NumScalars / NumParts;
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
          for (unsigned Idx : seq<unsigned>(0, PartSz)) {
            int K = I * PartSz + Idx;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  .getKnownMinValue());
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
      return std::max(Entries[I].front()->getVectorFactor(),
                      Entries[I].back()->getVectorFactor());
      count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
                            bool CompareOpcodes = true) {
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
          getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
template <typename T>
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
  unsigned Sz = Order.size();
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
static std::optional<Value *>
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
      return std::nullopt;
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      if (isa<SCEVCouldNotCompute>(Diff))
        return std::nullopt;
      PtrSCEVLowest = PtrSCEV;
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
      PtrSCEVHighest = PtrSCEV;
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
    if (Multiplier == Dist)
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    Stride = TryGetStride(Dist, Sz);
      return std::nullopt;
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  using DistOrdPair = std::pair<int64_t, int>;
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Coeff = TryGetStride(Diff, Stride);
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
        return std::nullopt;
      Dist = SC->getAPInt().getZExtValue();
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
      return std::nullopt;
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
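// calculateRtStride (fragments above) uses SCEV to look for a common
// non-constant stride among the pointers: each pointer's distance from the
// lowest one must be that stride times a constant, the constants must be
// distinct, and sorting them yields SortedIndices (left empty when the
// pointers are already in consecutive order).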
  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
  const unsigned Sz = VL.size();
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = cast<LoadInst>(V);
    *POIter = L->getPointerOperand();
         "supported with VectorizeNonPowerOf2");
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    std::optional<int> Diff =
    if (static_cast<unsigned>(*Diff) == Sz - 1)
    bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
        (static_cast<unsigned>(std::abs(*Diff)) <=
             static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
         *Diff == -(static_cast<int>(Sz) - 1))) {
      int Stride = *Diff / static_cast<int>(Sz - 1);
      if (*Diff == Stride * static_cast<int>(Sz - 1)) {
          else if (Ptr != Ptr0)
            if (((Dist / Stride) * Stride) != Dist ||
                !Dists.insert(Dist).second)
          if (Dists.size() == Sz)
  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
    unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
    unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
    MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
    for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
      unsigned VectorizedCnt = 0;
      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
           Cnt += VF, ++VectorizedCnt) {
      if (VectorizedCnt == VL.size() / VF) {
            Instruction::Load, VecTy,
          auto *LI0 = cast<LoadInst>(VL[I * VF]);
                Instruction::Load, SubVecTy, LI0->getAlign(),
                LI0->getPointerAddressSpace(), CostKind,
                Instruction::Load, SubVecTy, LI0->getPointerOperand(),
                Instruction::Load, SubVecTy, LI0->getPointerOperand(),
                "Expected only consecutive, strided or masked gather loads.");
          for (int Idx : seq<int>(0, VL.size()))
              ShuffleMask, CostKind, I * VF, SubVecTy);
        if (MaskedGatherCost > VecLdCost)
  bool ProfitableGatherPointers =
        return L->isLoopInvariant(V);
  if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
               (GEP && GEP->getNumOperands() == 2 &&
                isa<Constant, Instruction>(GEP->getOperand(1)));
    Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
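// Decision order visible in the load-vectorization fragments above:
// consecutive pointers give a plain vector load, a uniform non-unit stride
// gives a strided load, and otherwise the cost of a masked gather is compared
// against splitting the bundle into smaller vector loads plus shuffles before
// falling back to gathering the scalars.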
         "Expected list of pointer operands.");
  Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
    std::optional<int> Diff =
      Base.second.emplace_back(Ptr, *Diff, Cnt++);
      if (Bases.size() > VL.size() / 2 - 1)
      Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
  bool AnyConsecutive = false;
  for (auto &Base : Bases) {
    auto &Vec = Base.second;
    if (Vec.size() > 1) {
                   const std::tuple<Value *, int, unsigned> &Y) {
          return std::get<1>(X) < std::get<1>(Y);
      int InitialOffset = std::get<1>(Vec[0]);
        return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
  SortedIndices.clear();
  if (!AnyConsecutive)
  for (auto &Base : Bases) {
    for (auto &T : Base.second)
         "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();
  Ptrs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    return std::move(Order);
  return std::nullopt;
  if (VU->getType() != V->getType())
    // ...
  if (!VU->hasOneUse() && !V->hasOneUse())
    // ...
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    // ...
  // ...
      cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
  bool IsReusedIdx = false;
  do {
    if (IE2 == VU && !IE1)
      // ...
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      // ...
      IsReusedIdx |= ReusedIdx.test(Idx1);
      ReusedIdx.set(Idx1);
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        // ...
      IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    }
    if (IE2 && IE2 != VU) {
      // ...
      IsReusedIdx |= ReusedIdx.test(Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        // ...
      IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
    }
  } while (!IsReusedIdx && (IE1 || IE2));
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
  if (TE.isNonPowOf2Vec())
    return std::nullopt;
  // ...
  if (!TE.ReuseShuffleIndices.empty()) {
    // ...
      return std::nullopt;
    // ...
    unsigned Sz = TE.Scalars.size();
    if (TE.State == TreeEntry::NeedToGather) {
      if (std::optional<OrdersType> CurrentOrder = /* ... */) {
        // ...
        ::addMask(Mask, TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          // ...
            Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        /* ... */(/* ... */ TE.Scalars.front()->getType(),
                  2 * TE.getVectorFactor())) == 1)
      return std::nullopt;
    // ...
    if (TE.ReorderIndices.empty())
      std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
    // ...
    ::addMask(ReorderMask, TE.ReuseShuffleIndices);
    unsigned VF = ReorderMask.size();
    // ...
    unsigned NumParts = VF / Sz;
    // ...
    for (unsigned I = 0; I < VF; I += Sz) {
      // ...
      unsigned UndefCnt = 0;
      // ...
      if (/* ... */ Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
          /* ... */)
        return std::nullopt;
      // ...
      for (unsigned K = 0; K < NumParts; ++K)
        ResOrder[Val + Sz * K] = I + K;
    }
    return std::move(ResOrder);
  }
  unsigned VF = TE.getVectorFactor();
  // ...
                               TE.ReuseShuffleIndices.end());
  if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
      /* ... */ {
        std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
        return Idx && *Idx < Sz;
      } /* ... */) {
    // ...
    if (TE.ReorderIndices.empty())
      std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
    // ...
    for (unsigned I = 0; I < VF; ++I) {
      int &Idx = ReusedMask[I];
      // ...
      Value *V = TE.Scalars[ReorderMask[Idx]];
      // ...
      Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
    }
    // ...
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      // ...
      std::iota(SubMask.begin(), SubMask.end(), 0);
      // ...
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
    }
    if (TE.State == TreeEntry::NeedToGather &&
        /* ... */
        [](const auto &Data) { return Data.index() == Data.value(); })
      return std::nullopt;
    return std::move(ResOrder);
  }
  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      any_of(TE.UserTreeIndices,
             /* ... */ {
               return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
             }) &&
      (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
    return std::nullopt;
  if ((TE.State == TreeEntry::Vectorize ||
       TE.State == TreeEntry::StridedVectorize) &&
      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
      /* ... */)
    return TE.ReorderIndices;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
        // ...
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
        if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
          // ...
        }
      if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
        if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
          if (EE1->getOperand(0) != EE2->getOperand(0))
            // ...
        }
      // ...
    };
    auto IsIdentityOrder = [](const OrdersType &Order) {
      for (unsigned Idx : seq<unsigned>(0, Order.size()))
        // ...
    };
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;
    // ...
    std::iota(Phis.begin(), Phis.end(), 0);
    // ...
    for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
      // ...
    for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
      ResOrder[Id] = PhiToId[Phis[Id]];
    if (IsIdentityOrder(ResOrder))
      return std::nullopt;
    return std::move(ResOrder);
  }
  if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
      /* ... */) {
    if ((TE.getOpcode() == Instruction::ExtractElement ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
        /* ... */ {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        } /* ... */) {
      // ...
      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
                                   /* ... */);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // ...
    int Sz = TE.Scalars.size();
    if (/* ... */ count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
      // ...
          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
      if (It == TE.Scalars.begin())
        // ...
      if (It != TE.Scalars.end()) {
        // ...
        unsigned Idx = std::distance(TE.Scalars.begin(), It);
        // ...
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          // ...
          return std::move(Order);
        }
      }
    }
    // ...
      return std::nullopt;
    if (TE.Scalars.size() >= 4)
      // ...
      return CurrentOrder;
  }
  return std::nullopt;
}
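// Illustrative sketch (not part of the original source): several reordering
// queries above return std::nullopt when the computed order is the identity
// permutation, because applying it would be a no-op. A minimal model of that
// test:
#if 0
#include <vector>
bool isIdentityOrder(const std::vector<unsigned> &Order) {
  for (unsigned Idx = 0; Idx < Order.size(); ++Idx)
    if (Order[Idx] != Idx)
      return false;
  return true; // {0,1,2,3} is identity; {1,0,3,2} is not.
}
#endif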
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    // ...
    if (Cluster != FirstCluster)
      // ...
  }
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // ...
  const unsigned Sz = TE.Scalars.size();
  // ...
  if (TE.State != TreeEntry::NeedToGather ||
      // ...
  addMask(NewMask, TE.ReuseShuffleIndices);
  // ...
  TE.ReorderIndices.clear();
  // ...
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}
4900 "Expected same size of orders");
4901 unsigned Sz = Order.
size();
4903 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
4904 if (Order[
Idx] != Sz)
4905 UsedIndices.
set(Order[
Idx]);
4907 if (SecondaryOrder.
empty()) {
4908 for (
unsigned Idx : seq<unsigned>(0, Sz))
4909 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
4912 for (
unsigned Idx : seq<unsigned>(0, Sz))
4913 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
4914 !UsedIndices.
test(SecondaryOrder[
Idx]))
4915 Order[
Idx] = SecondaryOrder[
Idx];
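// Illustrative sketch (not part of the original source): the fixup above fills
// "unset" slots of a partial order (encoded as the sentinel value Sz) with
// still-unused indices, preferring the slot's own index or a hint from a
// secondary order. A simplified model without the secondary order:
#if 0
#include <vector>
void fixupOrder(std::vector<unsigned> &Order) {
  const unsigned Sz = Order.size();
  std::vector<bool> Used(Sz, false);
  for (unsigned V : Order)
    if (V != Sz)
      Used[V] = true;
  // Keep an unset slot at its own position when that index is still free.
  for (unsigned Idx = 0; Idx < Sz; ++Idx)
    if (Order[Idx] == Sz && !Used[Idx]) {
      Order[Idx] = Idx;
      Used[Idx] = true;
    }
  // Fill whatever remains with the smallest unused indices.
  unsigned Next = 0;
  for (unsigned &V : Order)
    if (V == Sz) {
      while (Used[Next])
        ++Next;
      V = Next;
      Used[Next] = true;
    }
}
// With Sz == 4, {2, 4, 0, 4} becomes {2, 1, 0, 3}.
#endif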
void BoUpSLP::reorderTopToBottom() {
  // ...
      ExternalUserReorderMap;
  // ...
  for_each(VectorizableTree, [/* ... */](const std::unique_ptr<TreeEntry> &TE) {
    // ...
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      // ... std::move(ExternalUserReorderIndices));
    }
    // ...
    if (TE->isAltShuffle()) {
      // ...
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      // ...
      for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
        if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
          OpcodeMask.set(Lane);
      // ...
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
        // ...
      }
    }
    if (std::optional<OrdersType> CurrentOrder = /* ... */) {
      // ...
      const TreeEntry *UserTE = TE.get();
      // ...
        if (UserTE->UserTreeIndices.size() != 1)
          // ...
        // ... return EI.UserTE->State == TreeEntry::Vectorize &&
        //        EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
        // ...
        UserTE = UserTE->UserTreeIndices.back().UserTE;
      // ...
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });

  // ...
  for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
       /* ... */) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      // ...
    // ...
    for (const TreeEntry *OpTE : OrderedEntries) {
      // ...
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        // ...
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           /* ... */]() {
        if (OpTE->State == TreeEntry::NeedToGather ||
            !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            // ...
        }
        if (OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            // ...
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            // ...
        }
        return OpTE->ReorderIndices;
      }();
      // ...
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // ...
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
        }
        // ...
      }
      // ...
      if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        // ...
        unsigned E = Order.size();
        // ... return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        // ...
        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
      } else {
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
      }
    }
    if (OrdersUses.empty())
      // ...
    // ...
      const unsigned Sz = Order.size();
      for (unsigned Idx : seq<unsigned>(0, Sz))
        if (Idx != Order[Idx] && Order[Idx] != Sz)
          // ...
    // ...
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    // ...
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
        // ...
      }
    }
    // ...
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // ...
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           IsIdentityOrder(BestOrder))) {
        // ...
        BestOrder = Pair.first;
        // ...
      }
    }
    // ...
    if (IsIdentityOrder(BestOrder))
      // ...
    // ...
    unsigned E = BestOrder.size();
    // ... return I < E ? static_cast<int>(I) : PoisonMaskElem;
    // ...
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // ...
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          // ... return EI.UserTE->Scalars.size() == VF ||
          //        EI.UserTE->Scalars.size() == ...
          // ... && "All users must be of VF size.");
          // ...
          reorderNodeWithReuses(*TE, Mask);
        }
        // ...
      }
      if ((TE->State == TreeEntry::Vectorize ||
           TE->State == TreeEntry::StridedVectorize) &&
          // ...
          !TE->isAltShuffle()) {
        // ...
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // ...
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        // ...
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // ...
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      }
    }
  }
}
bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    /* ... */) {
  // ...
  if (UserTE->isNonPowOf2Vec())
    // ...
  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize);
        }))
      // ...
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // ...
      if (any_of(TE->UserTreeIndices,
                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
        // ...
      Edges.emplace_back(I, TE);
      // ...
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        // ...
    }
    TreeEntry *Gather = nullptr;
    // ... [&Gather, UserTE, I](TreeEntry *TE) {
      assert(TE->State != TreeEntry::Vectorize &&
             TE->State != TreeEntry::StridedVectorize &&
             "Only non-vectorized nodes are expected.");
      if (any_of(TE->UserTreeIndices,
                 [UserTE, I](const EdgeInfo &EI) {
                   return EI.UserTE == UserTE && EI.EdgeIdx == I;
                 })) {
        assert(TE->isSame(UserTE->getOperand(I)) &&
               "Operand entry does not match operands.");
        // ...
      }
      // ...
  }
  // ...
}
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  // ...
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize)
      // ...
    if (std::optional<OrdersType> CurrentOrder = /* ... */) {
      OrderedEntries.insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
    }
  }
  // ...
  while (!OrderedEntries.empty()) {
    // ...
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            (TE->State == TreeEntry::NeedToGather && /* ... */)) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          /* ... */ {
            return EI.UserTE == TE->UserTreeIndices.front().UserTE;
          } /* ... */ ||
          !Visited.insert(TE).second) {
        // ...
      }
      // ...
      for (EdgeInfo &EI : TE->UserTreeIndices) {
        TreeEntry *UserTE = EI.UserTE;
        auto It = Users.find(UserTE);
        if (It == Users.end())
          It = Users.insert({UserTE, {}}).first;
        It->second.emplace_back(EI.EdgeIdx, TE);
      }
    }
    // ...
    for (TreeEntry *TE : Filtered)
      OrderedEntries.remove(TE);
    SmallVector<
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
        UsersVec(/* ... */);
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    });
    for (auto &Data : UsersVec) {
      // ...
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              /* ... */)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        // ...
      }
      // ...
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          // ...
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
          // ...
        const auto Order = [&]() -> const OrdersType {
          if (OpTE->State == TreeEntry::NeedToGather ||
              !OpTE->ReuseShuffleIndices.empty())
            // ...
          return OpTE->ReorderIndices;
        }();
        // ...
        if (Order.size() == 1)
          // ...
        // ... Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
        //       return P.second == OpTE;
        //     });
        if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          // ...
          unsigned E = Order.size();
          // ... return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          // ...
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
              NumOps;
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        }
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto AllowsReordering = [&](const TreeEntry *TE) {
          // ...
          if (TE->isNonPowOf2Vec())
            // ...
          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            // ...
          if (TE->State == TreeEntry::NeedToGather) {
            // ...
          }
          // ...
        };
        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
          TreeEntry *UserTE = EI.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            // ...
          // ...
          if (AllowsReordering(UserTE))
            // ...
          // ...
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           // ... return EI.UserTE == UserTE;
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
      if (OrdersUses.empty()) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        // ...
      }
      // ...
        const unsigned Sz = Order.size();
        for (unsigned Idx : seq<unsigned>(0, Sz))
          if (Idx != Order[Idx] && Order[Idx] != Sz)
            // ...
      // ...
      unsigned IdentityCnt = 0;
      unsigned VF = Data.second.front().second->getVectorFactor();
      // ...
      for (auto &Pair : OrdersUses) {
        if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
          IdentityCnt += Pair.second;
          // ...
        }
      }
      // ...
      unsigned Cnt = IdentityCnt;
      for (auto &Pair : OrdersUses) {
        // ...
        if (Cnt < Pair.second) {
          // ...
          BestOrder = Pair.first;
          // ...
        }
      }
      // ...
      if (IsIdentityOrder(BestOrder)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        // ...
      }
      // ...
      unsigned E = BestOrder.size();
      // ... return I < E ? static_cast<int>(I) : PoisonMaskElem;
      // ...
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        OrderedEntries.remove(TE);
        if (!VisitedOps.insert(TE).second)
          // ...
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
          // ...
        }
        // ...
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::StridedVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
          // ...
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        // ...
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
        // ...
      }
      for (TreeEntry *Gather : GatherOps) {
        // ... && "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // ...
        }
        // ...
        OrderedEntries.remove(Gather);
      }
      // ...
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Data.first->getMainOp()) ||
          Data.first->isAltShuffle())
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle() ||
          Data.first->State == TreeEntry::StridedVectorize) {
        // ...
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !Data.first->isAltShuffle()) {
          // ...
          OrderedEntries.insert(Data.first);
        }
        // ...
      }
    }
  }
  // ...
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
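// Illustrative sketch (not part of the original source): the loop above lets
// each operand entry "vote" for an order (OrdersUses) and then applies the
// most frequent one, with the identity order winning ties since it requires no
// shuffle. A simplified standalone model of the selection step:
#if 0
#include <map>
#include <vector>
using Order = std::vector<unsigned>;
Order pickBestOrder(const std::map<Order, unsigned> &Votes,
                    unsigned IdentityVotes) {
  Order Best;                  // empty means "keep the identity order"
  unsigned BestCnt = IdentityVotes;
  for (const auto &[Ord, Cnt] : Votes)
    if (Cnt > BestCnt) {       // must beat identity and any previous best
      Best = Ord;
      BestCnt = Cnt;
    }
  return Best;
}
#endif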
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  // ...
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    // ...
    if (Entry->State == TreeEntry::NeedToGather)
      // ...
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
        // ...
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        // ...
      const auto *ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
        // ...
      }
      for (User *U : Scalar->users()) {
        // ...
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          // ...
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          // ...
          if (UseEntry->State == TreeEntry::ScatterVectorize ||
              /* ... */(Scalar, cast<Instruction>(UseEntry->Scalars.front()),
                        TLI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              /* ... */);
            assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
            // ...
          }
        }
        // ...
        if (It != ScalarToExtUses.end()) {
          ExternalUses[It->second].User = nullptr;
          // ...
        }
        int FoundLane = Entry->findLaneForValue(Scalar);
        // ... << " from lane " << FoundLane << " from " << *Scalar ...
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, FoundLane);
      }
    }
  }
}
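// Illustrative sketch (not part of the original source): in the bookkeeping
// above a scalar that already has an entry with a null user is not revisited;
// a null user marks "an extract is needed regardless of the concrete user".
// A hypothetical standalone model of that dedup rule:
#if 0
#include <unordered_map>
#include <vector>
struct ExternalUse {
  const void *Scalar;
  const void *User; // nullptr: extract needed for an extra/unknown user
  int Lane;
};
struct ExternalUseTable {
  std::vector<ExternalUse> Uses;
  std::unordered_map<const void *, std::size_t> ScalarToUse;
  void record(const void *Scalar, const void *User, int Lane) {
    auto It = ScalarToUse.find(Scalar);
    if (It != ScalarToUse.end() && Uses[It->second].User == nullptr)
      return; // an unconditional extract is already recorded for this scalar
    ScalarToUse[Scalar] = Uses.size();
    Uses.push_back({Scalar, User, Lane});
  }
};
#endif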
// ...
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  // ...
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // ...
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      if (SI == nullptr || !SI->isSimple() ||
          // ...
      if (getTreeEntry(U))
        // ...
      auto &StoresVec = PtrToStoresMap[Ptr];
      // ...
      if (StoresVec.size() > Lane)
        // ...
      if (!StoresVec.empty() &&
          SI->getParent() != StoresVec.back()->getParent())
        // ...
      if (!StoresVec.empty() &&
          SI->getValueOperand()->getType() !=
              StoresVec.back()->getValueOperand()->getType())
        // ...
      StoresVec.push_back(SI);
    }
  }
  return PtrToStoresMap;
}
bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // ...
  StoreOffsetVec[0] = {S0, 0};
  // ...
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    // ...
    std::optional<int> Diff =
        /* ... */(/* ... */ SI->getPointerOperand(), *DL, *SE,
                  /* ... */);
    // ...
    StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
  }
  // ...
  stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
                                 const std::pair<StoreInst *, int> &Pair2) {
    int Offset1 = Pair1.second;
    int Offset2 = Pair2.second;
    return Offset1 < Offset2;
  });
  // ...
  for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
    if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
      // ...
  // ...
  ReorderIndices.reserve(StoresVec.size());
  // ...
      [SI](const std::pair<StoreInst *, int> &Pair) {
        return Pair.first == SI;
      }) /* ... */ -
      StoreOffsetVec.begin();
  ReorderIndices.push_back(Idx);
  // ...
  auto IsIdentityOrder = [](const OrdersType &Order) {
    for (unsigned Idx : seq<unsigned>(0, Order.size()))
      // ...
  };
  if (IsIdentityOrder(ReorderIndices))
    ReorderIndices.clear();
  // ...
  for (unsigned Idx : Order)
    // ...
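// Illustrative sketch (not part of the original source): the check above sorts
// the collected stores by constant offset, requires the offsets to form a
// consecutive run, and records for each original store its rank in the sorted
// order (identity orders are dropped). Simplified model on plain offsets:
#if 0
#include <algorithm>
#include <numeric>
#include <vector>
bool formReorderIndices(const std::vector<int> &Offsets,
                        std::vector<unsigned> &ReorderIndices) {
  std::vector<unsigned> Sorted(Offsets.size());
  std::iota(Sorted.begin(), Sorted.end(), 0U);
  std::stable_sort(Sorted.begin(), Sorted.end(), [&](unsigned A, unsigned B) {
    return Offsets[A] < Offsets[B];
  });
  for (unsigned I = 1; I < Sorted.size(); ++I)
    if (Offsets[Sorted[I]] != Offsets[Sorted[I - 1]] + 1)
      return false; // offsets are not consecutive
  ReorderIndices.assign(Offsets.size(), 0);
  for (unsigned Rank = 0; Rank < Sorted.size(); ++Rank)
    ReorderIndices[Sorted[Rank]] = Rank; // store I ends up at position Rank
  return true;
}
// Offsets {5, 3, 4} yield ReorderIndices {2, 0, 1}.
#endif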
SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();
  // ...
      collectUserStores(TE);
  // ...
  for (const auto &Pair : PtrToStoresMap) {
    auto &StoresVec = Pair.second;
    // ...
    if (StoresVec.size() != NumLanes)
      // ...
    // ...
    if (!canFormVector(StoresVec, ReorderIndices))
      // ...
    // ...
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  // ...
  UserIgnoreList = &UserIgnoreLst;
  // ...
  buildTree_rec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  // ...
  buildTree_rec(Roots, 0, EdgeInfo());
}

// ...
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {
    // ...
    if (!NeedsScheduling) {
      NeedsScheduling = V;
      // ...
    }
    // ...
  }
  return NeedsScheduling;
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI, /* ... */,
    bool AllowAlternate) {
  // ...
  if (auto *LI = dyn_cast<LoadInst>(V)) {
    // ...
    SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
    // ...
  }
  if (isa<ExtractElementInst, UndefValue>(V))
    // ...
  if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
    // ... !isa<UndefValue>(EI->getIndexOperand())
    // ...
  } else if (auto *I = dyn_cast<Instruction>(V)) {
    // ...
    if ((isa<BinaryOperator, CastInst>(I)) &&
        // ...
            /* ... */ : cast<CastInst>(I)->getOperand(0)->getType()));
      // ...
      if (isa<CastInst>(I)) {
        std::pair<size_t, size_t> OpVals =
            // ...
      }
    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
      // ...
      if (CI->isCommutative())
        // ...
    } else if (auto *Call = dyn_cast<CallInst>(I)) {
      // ...
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
        SubKey = hash_value(Gep->getPointerOperand());
      // ...
    } else if (/* ... */ !isa<ConstantInt>(I->getOperand(1))) {
      // ...
    }
    // ...
  }
  return std::make_pair(Key, SubKey);
}
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       ArrayRef<Value *> VL) const {
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  // ...
  for (unsigned Lane : seq<unsigned>(0, VL.size()))
    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
      OpcodeMask.set(Lane);
  // ...
  /* ... */ Opcode0, Opcode1, OpcodeMask))
    // ...
  for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
    // ...
    Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
  }
  // ...
  for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
    // ...
    switch (Res.value_or(0)) {
      // ...
    }
  }
  // ...
  constexpr unsigned NumAltInsts = 3;
  unsigned NonInstCnt = 0;
  // ...
  unsigned UndefCnt = 0;
  // ...
  unsigned ExtraShuffleInsts = 0;
  // ...
      return is_contained(Operands.back(), V);
  // ...
    ++ExtraShuffleInsts;
  // ...
    if (isa<Constant, ExtractElementInst>(V) ||
        getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
      if (isa<UndefValue>(V))
        // ...
    }
    // ...
    if (!Res.second && Res.first->second == 1)
      ++ExtraShuffleInsts;
    ++Res.first->getSecond();
    if (auto *I = dyn_cast<Instruction>(V))
      UniqueOpcodes.insert(I->getOpcode());
    else if (Res.second)
      // ...
  // ...
  return none_of(Uniques, [&](const auto &P) {
           return P.first->hasNUsesOrMore(P.second + 1) &&
                  /* ... */ {
                    return getTreeEntry(U) || Uniques.contains(U);
                  } /* ... */;
         }) /* ... */
         (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
    OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
  assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
  // ...
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto *VL0 = cast<Instruction>(S.OpValue);
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // ...
    for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
      // ...
      if (Term && Term->isTerminator()) {
        LLVM_DEBUG(dbgs()
                   << "SLP: Need to swizzle PHINodes (terminator use).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // ...
    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
    // ...
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    // ...
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // ...
    for (Value *V : VL) {
      SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
      // ... && "Non-constant or undef index?");
    }
    // ... return !SourceVectors.contains(V); ...
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    // ...
    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    // ...
      return TreeEntry::Vectorize;
    // ...
      return TreeEntry::ScatterVectorize;
    // ...
      return TreeEntry::StridedVectorize;
    // ...
    Type *ScalarTy = VL0->getType();
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy))
      LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
    // ... [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
    // ...
    return TreeEntry::NeedToGather;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
      // ...
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
    }
    // ...
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // ...
    Type *ComparedTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      // ...
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs()
                   << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    // ...
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    // ...
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      // ...
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // ...
    Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(V);
      // ...
      Type *CurTy = GEP->getSourceElementType();
      // ...
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
    }
    // ...
    Type *Ty1 = VL0->getOperand(1)->getType();
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      // ...
      auto *Op = I->getOperand(1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Store: {
    // ...
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // ...
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // ...
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        // ...
        return TreeEntry::NeedToGather;
      }
      // ...
    }
    // ...
    if (CurrentOrder.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[CurrentOrder.front()];
      PtrN = PointerOps[CurrentOrder.back()];
    }
    std::optional<int> Dist = /* ... */;
    if (static_cast<unsigned>(*Dist) == VL.size() - 1)
      return TreeEntry::Vectorize;
    // ...
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    // ...
    CallInst *CI = cast<CallInst>(VL0);
    // ...
      return TreeEntry::NeedToGather;
    // ...
    for (unsigned J = 0; J != NumArgs; ++J)
      // ...
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(V);
      // ...
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          /* ... */);
        return TreeEntry::NeedToGather;
      // ...
      for (unsigned J = 0; J != NumArgs; ++J) {
        // ...
        if (ScalarArgs[J] != A1J) {
          LLVM_DEBUG(dbgs()
                     << "SLP: mismatched arguments in call:" << *CI
                     << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
          return TreeEntry::NeedToGather;
        }
      }
      // ...
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
    }
    // ...
    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    // ...
    if (!S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    // ...
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    // ...
    return TreeEntry::Vectorize;
  }
  default:
    // ...
    return TreeEntry::NeedToGather;
  }
}
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                            const EdgeInfo &UserTreeIdx) {
  // ...
  auto TryToFindDuplicates = [&](const InstructionsState &S,
                                 bool DoNotFail = false) {
    // ...
    for (Value *V : VL) {
      // ...
      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
      // ...
    }
    size_t NumUniqueScalarValues = UniqueValues.size();
    if (NumUniqueScalarValues == VL.size()) {
      ReuseShuffleIndicies.clear();
    } else {
      // ...
      if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
        LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                             "for nodes with padding.\n");
        newTreeEntry(VL, std::nullopt, S, UserTreeIdx);
        // ...
      }
      if (NumUniqueScalarValues <= 1 ||
          (UniquePositions.size() == 1 &&
           all_of(UniqueValues,
                  /* ... */ {
                    return isa<UndefValue>(V) || /* ... */;
                  })) ||
          !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
        if (DoNotFail && UniquePositions.size() > 1 &&
            NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
            /* ... */ {
              return isa<ExtractElementInst>(V) ||
                     areAllUsersVectorized(cast<Instruction>(V),
                                           /* ... */);
            } /* ... */) {
          // ...
          if (PWSz == VL.size()) {
            ReuseShuffleIndicies.clear();
          } else {
            NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
            NonUniqueValueVL.append(PWSz - UniqueValues.size(),
                                    UniqueValues.back());
            VL = NonUniqueValueVL;
          }
          // ...
        }
        // ...
        newTreeEntry(VL, std::nullopt, S, UserTreeIdx);
        // ...
      }
      // ...
    }
    // ...
  };

  // ...
  if (!EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        LLVM_DEBUG(dbgs() << /* ... */ << ") is ephemeral.\n");
        newTreeEntry(VL, std::nullopt, S, UserTreeIdx);
        // ...
      }
    }
  }

  // ...
      !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
        // ...
            cast<Instruction>(I)->getOpcode() ==
                cast<Instruction>(S.MainOp)->getOpcode();
        // ...
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt, S, UserTreeIdx, ReuseShuffleIndicies);
    // ...

  // ...
  if (S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt, S, UserTreeIdx, ReuseShuffleIndicies);
    // ...
  }

  // ...
  if (S.OpValue->getType()->isVectorTy() &&
      !isa<InsertElementInst>(S.OpValue)) {
    // ...
    newTreeEntry(VL, std::nullopt, S, UserTreeIdx);
    // ...
  }
  if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
    if (SI->getValueOperand()->getType()->isVectorTy()) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
      newTreeEntry(VL, std::nullopt, S, UserTreeIdx);
      // ...
    }

  // ...
  auto &&NotProfitableForVectorization = [&S, this,
                                          /* ... */](ArrayRef<Value *> VL) {
    if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
      // ...
    // ...
    for (Value *V : VL) {
      auto *I = cast<Instruction>(V);
      // ... return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
    }
    // ...
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        (/* ... */
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
      // ...
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    // ...
    auto *I1 = cast<Instruction>(VL.front());
    auto *I2 = cast<Instruction>(VL.back());
    for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
      // ... I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              // ...
            })) >= S.MainOp->getNumOperands() / 2)
      // ...
    if (S.MainOp->getNumOperands() > 2)
      // ...
    if (IsCommutative) {
      // ...
      for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
        // ... I2->getOperand((Op + 1) % E));
      // ...
          Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
            // ...
          }
    }
    // ...
  };

  // ...
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameInsts =
      /* ... */ ||
      (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
       /* ... */ {
         auto *I = dyn_cast<GetElementPtrInst>(V);
         // ...
           BB = I->getParent();
         return BB == I->getParent() && I->getNumOperands() == 2;
       } /* ... */);
  // ...
  if (/* ... */
      (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
           /* ... */) ||
       /* ... */) ||
      NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt, S, UserTreeIdx, ReuseShuffleIndicies);
    // ...
  }

  // ...
  if (TreeEntry *E = getTreeEntry(S.OpValue)) {
    LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
    if (!E->isSame(VL)) {
      auto It = MultiNodeScalars.find(S.OpValue);
      if (It != MultiNodeScalars.end()) {
        auto *TEIt = find_if(It->getSecond(),
                             [&](TreeEntry *ME) { return ME->isSame(VL); });
        if (TEIt != It->getSecond().end())
          // ...
      }
      // ...
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
      if (TryToFindDuplicates(S))
        newTreeEntry(VL, std::nullopt, S, UserTreeIdx, ReuseShuffleIndicies);
      // ...
    }
    // ...
    E->UserTreeIndices.push_back(UserTreeIdx);
    LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
                      /* ... */);
    // ...
  }

  // ...
  for (Value *V : VL) {
    if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
        /* ... */)
      // ...
    if (getTreeEntry(V)) {
      LLVM_DEBUG(dbgs() << /* ... */ << ") is already in tree.\n");
      if (TryToFindDuplicates(S))
        newTreeEntry(VL, std::nullopt, S, UserTreeIdx, ReuseShuffleIndicies);
      // ...
    }
  }

  // ...
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList && UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        if (TryToFindDuplicates(S))
          newTreeEntry(VL, std::nullopt, S, UserTreeIdx, ReuseShuffleIndicies);
        // ...
      }
    }
  }

  // ...
  if (AreAllSameInsts && UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
      /* ... */) {
    assert(S.OpValue->getType()->isPointerTy() &&
           count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
           "Expected pointers only.");
    // ...
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");
    // ...
  }

  auto *VL0 = cast<Instruction>(S.OpValue);
  // ...
    newTreeEntry(VL, std::nullopt, S, UserTreeIdx);
    // ...
    newTreeEntry(VL, std::nullopt, S, UserTreeIdx);
    // ...
  if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
    // ...

  // ...
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
  if (State == TreeEntry::NeedToGather) {
    newTreeEntry(VL, std::nullopt, S, UserTreeIdx, ReuseShuffleIndicies);
    // ...
  }

  // ...
  auto &BSRef = BlocksSchedules[BB];
  // ...
    BSRef = std::make_unique<BlockScheduling>(BB);
  BlockScheduling &BS = *BSRef;
  // ...
  std::optional<ScheduleData *> Bundle =
      BS.tryScheduleBundle(UniqueValues, this, S);
#ifdef EXPENSIVE_CHECKS
  // ...
#endif
  // ...
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    assert((!BS.getScheduleData(VL0) ||
            !BS.getScheduleData(VL0)->isPartOfBundle()) &&
           "tryScheduleBundle should cancelScheduling on failure");
    newTreeEntry(VL, std::nullopt, S, UserTreeIdx, ReuseShuffleIndicies);
    // ...
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    auto *PH = cast<PHINode>(VL0);
    // ...
    newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
    // ...
    for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
      // ...
      Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
          PH->getIncomingBlock(I)));
      // ...
    }
    // ...
    for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
      buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
    // ...
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
      newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
      // ...
      Op0.assign(VL.size(), VL0->getOperand(0));
      VectorizableTree.back()->setOperand(0, Op0);
      // ...
    }
    LLVM_DEBUG({
      dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
             /* ... */;
      for (unsigned Idx : CurrentOrder)
        // ...
    });
    // ...
    newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies,
                 CurrentOrder);
    // ...
    Op0.assign(VL.size(), VL0->getOperand(0));
    VectorizableTree.back()->setOperand(0, Op0);
    // ...
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
    // ...
    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    // ... decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      // ...
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      // ...
    }
    // ...
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, std::nullopt,
                                 CurrentOrder);
    // ...
    constexpr int NumOps = 2;
    // ...
    for (int I = 0; I < NumOps; ++I) {
      // ...
      VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));
      // ...
      TE->setOperand(I, VectorOperands[I]);
    }
    buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1});
    // ...
  }
  case Instruction::Load: {
    // ...
    TreeEntry *TE = nullptr;
    // ...
    case TreeEntry::Vectorize:
      if (CurrentOrder.empty()) {
        // ...
        TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
      } else {
        // ...
        TE = newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies,
                          CurrentOrder);
      }
      TE->setOperandsInOrder();
      break;
    case TreeEntry::StridedVectorize:
      // ...
      if (CurrentOrder.empty()) {
        TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndicies);
      } else {
        TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
      }
      TE->setOperandsInOrder();
      // ...
      break;
    case TreeEntry::ScatterVectorize:
      // ...
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndicies);
      TE->setOperandsInOrder();
      buildTree_rec(PointerOps, Depth + 1, {TE, 0});
      LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
      break;
    case TreeEntry::NeedToGather:
      // ...
    }
    // ...
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             /* ... */),
          /* ... */(DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
                    /* ... */));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          /* ... */(DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
                    /* ... */),
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             /* ... */));
      ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits = /* ... */;
      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
        // ...
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      }
      if (NumSignBits * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndicies);
    // ...
    TE->setOperandsInOrder();
    for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
      // ...
      Operands.push_back(cast<Instruction>(V)->getOperand(I));
      // ...
    }
    // ...
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndicies);
    // ...
    // ... && "Commutative Predicate mismatch");
    reorderInputsAccordingToOpcode(VL, Left, Right, *this);
    // ...
    for (Value *V : VL) {
      auto *Cmp = cast<CmpInst>(V);
      // ...
      if (Cmp->getPredicate() != P0)
        // ...
    }
    // ...
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 = /* ... */;
      if (NumSignBits0 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 = /* ... */;
      if (NumSignBits1 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
    }
    // ...
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndicies);
    // ...
      reorderInputsAccordingToOpcode(VL, Left, Right, *this);
    // ...
    TE->setOperandsInOrder();
    for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
      // ...
      Operands.push_back(cast<Instruction>(V)->getOperand(I));
      // ...
    }
    // ...
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndicies);
    // ...
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GetElementPtrInst>(V);
      // ...
      Operands.front().push_back(GEP->getPointerOperand());
      // ...
    }
    // ...
    Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
    // ... [VL0Ty, IndexIdx](Value *V) {
          auto *GEP = dyn_cast<GetElementPtrInst>(V);
          // ...
          return VL0Ty == GEP->getOperand(IndexIdx)->getType();
        }
        // ...
        : DL->getIndexType(cast<GetElementPtrInst>(VL0)
                               ->getPointerOperandType()
                               /* ... */);
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      // ...
        ConstantInt::get(Ty, 0, /*isSigned=*/false));
      // ...
      auto *Op = I->getOperand(IndexIdx);
      auto *CI = dyn_cast<ConstantInt>(Op);
      // ...
          CI, Ty, CI->getValue().isSignBitSet(), *DL));
    }
    // ...
    for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
      // ...
  }
  case Instruction::Store: {
    // ...
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      *OIter = SI->getValueOperand();
      // ...
    }
    // ...
    if (CurrentOrder.empty()) {
      // ...
      TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                   ReuseShuffleIndicies);
      TE->setOperandsInOrder();
      // ...
    } else {
      TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                   ReuseShuffleIndicies, CurrentOrder);
      TE->setOperandsInOrder();
      // ...
      LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
    }
    // ...
  }
  case Instruction::Call: {
    // ...
    CallInst *CI = cast<CallInst>(VL0);
    // ...
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndicies);
    // ...
      reorderInputsAccordingToOpcode(VL, Left, Right, *this);
      // ...
      for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
        // ...
        for (Value *V : VL) {
          auto *CI2 = cast<CallInst>(V);
          // ...
        }
        // ...
      }
      // ...
      for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
        // ...
      }
      // ...
    TE->setOperandsInOrder();
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      // ...
      for (Value *V : VL) {
        auto *CI2 = cast<CallInst>(V);
        // ...
      }
      // ...
    }
    // ...
  }
  case Instruction::ShuffleVector: {
    TreeEntry *TE = newTreeEntry(VL, Bundle, S, UserTreeIdx,
                                 ReuseShuffleIndicies);
    // ...
    auto *CI = dyn_cast<CmpInst>(VL0);
    if (isa<BinaryOperator>(VL0) || CI) {
      // ...
        return cast<CmpInst>(V)->isCommutative();
      // ...
        reorderInputsAccordingToOpcode(VL, Left, Right, *this);
      // ...
      auto *MainCI = cast<CmpInst>(S.MainOp);
      auto *AltCI = cast<CmpInst>(S.AltOp);
      // ... && "Expected different main/alternate predicates.");
      // ...
      for (Value *V : VL) {
        auto *Cmp = cast<CmpInst>(V);
        // ...
      }
      // ...
    }
    TE->setOperandsInOrder();
    for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
      // ...
      Operands.push_back(cast<Instruction>(V)->getOperand(I));
      // ...
    }
    // ...
  }
  }
  // ...
  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // ...
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          // ...
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }
  // ...
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    // ...
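// Illustrative sketch (not part of the original source): the loop above
// flattens homogeneous aggregates (arrays, structs with identical members,
// fixed vectors) into a total element count N plus a scalar element type.
// Simplified model for nested arrays only (types are hypothetical):
#if 0
#include <cstddef>
struct Ty {
  bool IsArray = false;
  std::size_t NumElements = 0;
  const Ty *Element = nullptr; // element type when IsArray is true
};
std::size_t flattenedElementCount(const Ty *T) {
  std::size_t N = 1;
  while (T->IsArray) {
    N *= T->NumElements;
    T = T->Element;
  }
  return N; // e.g. [4 x [2 x float]] flattens to 8 float elements
}
#endif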
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  assert(
      all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
      /* ... */);
  // ...
  Value *Vec = E0->getOperand(0);
  // ...
  CurrentOrder.clear();
  // ...
  if (E0->getOpcode() == Instruction::ExtractValue) {
    // ...
    LoadInst *LI = dyn_cast<LoadInst>(Vec);
    // ...
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }
  // ...
  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    // ...
  // ...
  unsigned MinIdx = NElts, MaxIdx = 0;
  // ...
    auto *Inst = dyn_cast<Instruction>(V);
    // ...
    if (Inst->getOperand(0) != Vec)
      // ...
    if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
      if (isa<UndefValue>(EE->getIndexOperand()))
        // ...
    // ...
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      // ...
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      // ...
    if (MaxIdx < ExtIdx)
      // ...
  // ...
  if (MaxIdx - MinIdx + 1 > E)
    // ...
  if (MaxIdx + 1 <= E)
    // ...
  // ...
  bool ShouldKeepOrder = true;
  // ...
  for (unsigned I = 0; I < E; ++I) {
    // ...
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      // ...
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();
  return ShouldKeepOrder;
}
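// Illustrative sketch (not part of the original source): the reuse check above
// succeeds when the extract indices (shifted by their minimum) form a
// duplicate-free permutation; if that permutation is the identity, the order
// vector is cleared, meaning "use the source vector as-is". Model:
#if 0
#include <vector>
// Returns true if Indices is exactly 0..N-1 in order; Order is left empty in
// that case, otherwise it holds the destination position of each element.
bool classifyExtractOrder(const std::vector<unsigned> &Indices,
                          std::vector<unsigned> &Order) {
  const unsigned E = Indices.size();
  Order.assign(E, E);
  bool Identity = true;
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] >= E || Order[Indices[I]] != E) {
      Order.clear();
      return false; // out-of-range or duplicated index
    }
    Identity &= Indices[I] == I;
    Order[Indices[I]] = I;
  }
  if (Identity)
    Order.clear();
  return Identity;
}
#endif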
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return ScalarToTreeEntry.contains(U) ||
                  isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, /* ... */) {
  // ...
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  // ...
                      dyn_cast<IntrinsicInst>(CI));
  auto IntrinsicCost = /* ... */;
  // ...
  auto LibCost = IntrinsicCost;
  // ...
  return {IntrinsicCost, LibCost};
}
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    /* ... */) const {
  unsigned Sz = Scalars.size();
  // ...
  if (!ReorderIndices.empty())
    // ...
  for (unsigned I = 0; I < Sz; ++I) {
    // ...
    if (!ReorderIndices.empty())
      // ...
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      // ...
    }
    // ...
  }
  if (!ReuseShuffleIndices.empty()) {
    // ... return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    // ...
  }
}

// ...
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    // ...
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    // ...
    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           /* ... */);
    return MainP != P && MainP != SwappedP;
  }
  // ...
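// Illustrative sketch (not part of the original source): buildAltOpShuffleMask
// produces a mask that selects lane I from the "main" vectorized result when
// scalar I uses the main opcode, and from the "alternate" result (offset by
// Sz) otherwise; for an add/sub alternation of width 4 the mask is {0,5,2,7}.
#if 0
#include <vector>
std::vector<int> buildAltMask(const std::vector<bool> &IsAltLane) {
  const int Sz = static_cast<int>(IsAltLane.size());
  std::vector<int> Mask(Sz);
  for (int I = 0; I < Sz; ++I)
    Mask[I] = IsAltLane[I] ? Sz + I : I;
  return Mask; // {false,true,false,true} -> {0, 5, 2, 7}
}
#endif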
  // ...
  const auto *Op0 = Ops.front();
  // ...
  const bool IsUniform = all_of(Ops, [=](Value *V) {
    // ...
  });
  // ...
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    // ...
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();
    // ...
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    // ...
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
    // ...
  });
  // ...
  if (IsConstant && IsUniform)
    // ...
  else if (IsConstant)
    // ...
class BaseShuffleAnalysis {
  // ...
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             /* ... */) {
    int Limit = Mask.size();
    // ...
    if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
          // ...
        }))
      // ...
  }

  static void combineMasks(int LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    // ...
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      // ...
      int MaskedIdx = Mask[ExtMask[I] % VF];
      // ...
    }
    // ...
  }

  static bool peekThroughShuffles(Value *&Op, MutableArrayRef<int> Mask,
                                  bool SinglePermute) {
    // ...
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // ...
      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
      // ...
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             /* ... */ IdentityMask.size())) {
          // ...
          IdentityMask.assign(Mask);
          // ...
        }
      }
      // ...
      if (SV->isZeroEltSplat()) {
        // ...
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      // ... dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      // ...
      // ... static_cast<unsigned>(I) >= SV->getShuffleMask().size())
        // ...
      ExtMask[Idx] = SV->getMaskValue(I);
      // ...
      if (!IsOp1Undef && !IsOp2Undef) {
        // ...
        for (int &I : Mask) {
          // ...
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              /* ... */)
            // ...
        }
        // ...
            SV->getShuffleMask().end());
        combineMasks(LocalVF, ShuffleMask, Mask);
        Mask.swap(ShuffleMask);
        // ...
          Op = SV->getOperand(0);
        // ...
          Op = SV->getOperand(1);
      }
      if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
          !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
          /* ... */) {
        // ... && "Expected masks of same sizes.");
        // ...
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               /* ... */) ||
                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 /* ... */));
      }
      // ...
    }
    // ...
  }

  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder) {
    assert(V1 && "Expected at least one vector value.");
    // ...
      Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    // ...
        cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
    // ...
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      // ...
        CombinedMask1[I] = Mask[I];
      // ...
        CombinedMask2[I] = Mask[I] - VF;
    }
    // ...
    do {
      // ...
      (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
      (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
      // ...
      if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
        if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
          // ...
          ExtMask1[Idx] = SV1->getMaskValue(I);
          // ...
              cast<FixedVectorType>(SV1->getOperand(1)->getType())
                  /* ... */,
              ExtMask1, UseMask::SecondArg);
          // ...
          ExtMask2[Idx] = SV2->getMaskValue(I);
          // ...
              cast<FixedVectorType>(SV2->getOperand(1)->getType())
                  /* ... */,
              ExtMask2, UseMask::SecondArg);
          if (SV1->getOperand(0)->getType() ==
                  SV2->getOperand(0)->getType() &&
              SV1->getOperand(0)->getType() != SV1->getType() &&
              /* ... */) {
            Op1 = SV1->getOperand(0);
            Op2 = SV2->getOperand(0);
            // ...
                SV1->getShuffleMask().end());
            int LocalVF = ShuffleMask1.size();
            if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
              LocalVF = FTy->getNumElements();
            combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
            CombinedMask1.swap(ShuffleMask1);
            // ...
                SV2->getShuffleMask().end());
            LocalVF = ShuffleMask2.size();
            if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
              LocalVF = FTy->getNumElements();
            combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
            CombinedMask2.swap(ShuffleMask2);
          }
        }
      // ...
    } while (PrevOp1 != Op1 || PrevOp2 != Op2);
    Builder.resizeToMatch(Op1, Op2);
    VF = std::max(cast<VectorType>(Op1->getType())
                      /* ... */.getKnownMinValue(),
                  cast<VectorType>(Op2->getType())
                      /* ... */.getKnownMinValue());
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      // ... && "Expected undefined mask element");
      CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
      // ...
    }
    // ...
    if (/* ... */ isa<ShuffleVectorInst>(Op1) &&
        cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
            /* ... */)
      return Builder.createIdentity(Op1);
    return Builder.createShuffleVector(
        /* ... */);
    // ...
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    // ...
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");
    // ...
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }
};
  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
  // ...
          Mask, NumSrcElts, NumSubElts, Index)) {
    if (Index + NumSubElts > NumSrcElts &&
        Index + NumSrcElts <= static_cast<int>(Mask.size()))
      // ...
  }
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  // ...
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // ...
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        /* ... */);
    // ...
    for (Value *V : Ptrs) {
      // ...
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // ...
      if (!Ptr || !Ptr->hasOneUse())
        // ...
    }
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // ...
    }
    // ...
        TTI::PointersChainInfo::getKnownStride(),
        /* ... */);
  } else {
    // ...
        [](const Value *V) {
          auto *Ptr = dyn_cast<GetElementPtrInst>(V);
          return Ptr && !Ptr->hasAllConstantIndices();
        })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    // ...
    if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
      // ...
          BaseGEP->getPointerOperand(), Indices, VecTy,
          /* ... */);
    }
  }
  return std::make_pair(ScalarCost, VecCost);
}
7821   for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
7822     TreeEntry &E = *TE.get();
7823     switch (E.getOpcode()) {
7824     case Instruction::Load: {
7825       Type *ScalarTy = E.getMainOp()->getType();
7827       Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
7834         auto *BaseLI = cast<LoadInst>(E.Scalars.back());
7841             Instruction::Load, VecTy, BaseLI->getPointerOperand(),
7842             false, CommonAlignment, CostKind, BaseLI);
7843         if (StridedCost < OriginalVecCost)
7846           E.State = TreeEntry::StridedVectorize;
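// A minimal sketch of the decision above, under the assumption that only two
// alternatives are compared: a contiguous vector load (plus a reverse shuffle
// when the elements are used in reverse order) versus a strided load that
// already delivers the elements in the required order. The cost values and
// field names are hypothetical; the real code obtains them from TTI.
struct ConsecutiveLoadCosts {
  long WideLoad;       // one contiguous vector load
  long ReverseShuffle; // shuffle needed when the order is reversed, else 0
  long StridedLoad;    // strided load producing elements already in order
};

static bool shouldUseStridedLoad(const ConsecutiveLoadCosts &C) {
  return C.StridedLoad < C.WideLoad + C.ReverseShuffle;
}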
7863   bool IsFinalized = false;
7876   bool SameNodesEstimated = true;
7885     if (auto *VTy = dyn_cast<VectorType>(Ty))
7901     const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
7902     unsigned MinVF = R.getMinVF(2 * Sz);
7903     if (VL.size() > 2 &&
7904         ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
7905          (InVectors.empty() &&
7908              ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
7909              InstructionsState S = getSameOpcode(SubVL, *R.TLI);
7910              return S.getOpcode() == Instruction::Load &&
7913         !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
7919       unsigned StartIdx = 0;
7920       unsigned VF = VL.size() / 2;
7921       for (; VF >= MinVF; VF /= 2) {
7922         for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
7925           if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
7927             if (SliceS.getOpcode() != Instruction::Load ||
7928                 SliceS.isAltShuffle())
7936                                             CurrentOrder, PointerOps);
7946                 CurrentOrder.empty()) ||
7955             if (Cnt == StartIdx)
7964       if (StartIdx >= VL.size())
7967       if (!VectorizedLoads.empty())
7970       if (!VectorizedLoads.empty()) {
7972         bool NeedInsertSubvectorAnalysis =
7973             !NumParts || (VL.size() / VF) > NumParts;
7979               getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
7986         for (Value *V : VectorizedLoads) {
7987           auto *LI = cast<LoadInst>(V);
7994         for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
7995           auto *LI = cast<LoadInst>(VL[P.first]);
8004               false, Alignment, CostKind, LI);
8008             PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8009           auto [ScalarGEPCost, VectorGEPCost] =
8011                           Instruction::Load, CostKind, LI->getType(), LoadTy);
8012           GatherCost += VectorGEPCost - ScalarGEPCost;
8014         for (unsigned P : ScatterVectorized) {
8015           auto *LI0 = cast<LoadInst>(VL[P]);
8017           Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
8019               Instruction::Load, LoadTy, LI0->getPointerOperand(),
8020               false, CommonAlignment, CostKind, LI0);
8024             PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8032           auto [ScalarGEPCost, VectorGEPCost] =
8034                           CostKind, ScalarTy, VecTy);
8035           GatherCost += VectorGEPCost - ScalarGEPCost;
8036           if (!Order.empty()) {
8040                                            VecTy, Mask, CostKind);
8043             GatherCost += R.getGatherCost(PointerOps, true);
8046         if (NeedInsertSubvectorAnalysis) {
8049           for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
8050             for (unsigned Idx : seq<unsigned>(0, E))
8053                                       ShuffleMask, CostKind, I, LoadTy);
8056         GatherCost -= ScalarsCost;
8058       GatherCost = std::min(BaseCost, GatherCost);
8059     } else if (!Root && isSplat(VL)) {
8062       const auto *It = find_if_not(VL, IsaPred<UndefValue>);
8063       assert(It != VL.end() && "Expected at least one non-undef value.");
8066           count(VL, *It) > 1 &&
8070                                       CostKind, std::distance(VL.begin(), It),
8075             return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
8078             Instruction::InsertElement, VecTy, CostKind, 0,
8082                               ShuffleMask, CostKind, 0,
8086                (all_of(Gathers, IsaPred<UndefValue>)
8088                     : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
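// A minimal standalone sketch of the slicing strategy above: when a gather
// contains loads, progressively smaller power-of-two slices of the scalar
// list are probed for sub-sequences that can be turned into vector loads
// instead of being gathered element by element. The predicate name is
// hypothetical; the real code queries canVectorizeLoads for each slice.
#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

static std::vector<std::pair<size_t, size_t>> findLoadableSlices(
    size_t NumScalars, size_t MinVF,
    const std::function<bool(size_t Offset, size_t Width)> &IsVectorizableSlice) {
  std::vector<std::pair<size_t, size_t>> Slices; // (offset, width) pairs
  std::vector<bool> Covered(NumScalars, false);
  for (size_t VF = NumScalars / 2; VF >= MinVF && VF >= 2; VF /= 2) {
    for (size_t Off = 0; Off + VF <= NumScalars; Off += VF) {
      bool Free = true;
      for (size_t I = Off; I < Off + VF; ++I)
        Free = Free && !Covered[I];
      if (!Free || !IsVectorizableSlice(Off, VF))
        continue;
      Slices.emplace_back(Off, VF);
      for (size_t I = Off; I < Off + VF; ++I)
        Covered[I] = true;
    }
  }
  return Slices;
}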
8095       ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8096       unsigned NumParts) {
8097     assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
8099         std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
8100           auto *EE = dyn_cast<ExtractElementInst>(V);
8103           auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
8106           return std::max(Sz, VecTy->getNumElements());
8110     if (NumSrcRegs == 0)
8115     auto CheckPerRegistersShuffle =
8120           int FirstRegId = -1;
8121           for (int &I : Mask) {
8124             int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
8127             RegIndices.insert(RegId);
8128             if (RegIndices.size() > 2)
8129               return std::nullopt;
8130             if (RegIndices.size() == 2)
8132             I = (I % NumElts) % EltsPerVector +
8133                 (RegId == FirstRegId ? 0 : EltsPerVector);
8142     for (unsigned Part = 0; Part < NumParts; ++Part) {
8143       if (!ShuffleKinds[Part])
8146           Mask.slice(Part * EltsPerVector,
8147                      (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
8148                          ? Mask.size() % EltsPerVector
8152       std::optional<TTI::ShuffleKind> RegShuffleKind =
8153           CheckPerRegistersShuffle(SubMask);
8154       if (!RegShuffleKind) {
8156             TTI, *ShuffleKinds[Part],
8163           TTI, *RegShuffleKind,
8174     for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
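// A minimal standalone sketch of the per-register check above: each sub-mask
// is rewritten to local element numbers and rejected when it would pull
// elements from more than two source registers. std::vector<int> stands in
// for MutableArrayRef<int>, and kPoisonLane for PoisonMaskElem; both names
// are hypothetical.
#include <optional>
#include <set>
#include <vector>

static constexpr int kPoisonLane = -1;

// Returns the number of distinct source registers (1 or 2), or std::nullopt
// when more than two registers are referenced. SubMask is rewritten in place
// so that indices of the first register land in [0, EltsPerVector) and
// indices of the second register in [EltsPerVector, 2 * EltsPerVector).
// An all-poison sub-mask is treated as a single-register shuffle.
static std::optional<unsigned>
normalizePerRegisterMask(std::vector<int> &SubMask, int EltsPerVector) {
  std::set<int> Regs;
  int FirstReg = -1;
  for (int &Idx : SubMask) {
    if (Idx == kPoisonLane)
      continue;
    int Reg = Idx / EltsPerVector;
    if (FirstReg < 0)
      FirstReg = Reg;
    Regs.insert(Reg);
    if (Regs.size() > 2)
      return std::nullopt;
    Idx = Idx % EltsPerVector + (Reg == FirstReg ? 0 : EltsPerVector);
  }
  return Regs.empty() ? 1u : static_cast<unsigned>(Regs.size());
}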
8181   void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
8183                                 unsigned SliceSize) {
8184     if (SameNodesEstimated) {
8190       if ((InVectors.size() == 2 &&
8191            InVectors.front().get<const TreeEntry *>() == &E1 &&
8192            InVectors.back().get<const TreeEntry *>() == E2) ||
8193           (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
8196                "Expected all poisoned elements.");
8199         copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
8204       Cost += createShuffle(InVectors.front(),
8205                             InVectors.size() == 1 ? nullptr : InVectors.back(),
8207       transformMaskAfterShuffle(CommonMask, CommonMask);
8209     SameNodesEstimated = false;
8210     if (!E2 && InVectors.size() == 1) {
8211       unsigned VF = E1.getVectorFactor();
8214             cast<FixedVectorType>(V1->getType())->getNumElements());
8216         const auto *E = InVectors.front().get<const TreeEntry *>();
8217         VF = std::max(VF, E->getVectorFactor());
8219       for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8221           CommonMask[Idx] = Mask[Idx] + VF;
8222       Cost += createShuffle(InVectors.front(), &E1, CommonMask);
8223       transformMaskAfterShuffle(CommonMask, CommonMask);
8225       Cost += createShuffle(&E1, E2, Mask);
8226       transformMaskAfterShuffle(CommonMask, Mask);
8230   class ShuffleCostBuilder {
8233     static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
8235       return Mask.empty() ||
8236              (VF == Mask.size() &&
8244     ~ShuffleCostBuilder() = default;
8249           cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8250       if (isEmptyOrIdentity(Mask, VF))
8253           cast<VectorType>(V1->getType()), Mask);
8258           cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8259       if (isEmptyOrIdentity(Mask, VF))
8262           cast<VectorType>(V1->getType()), Mask);
8268     void resizeToMatch(Value *&, Value *&) const {}
8278     ShuffleCostBuilder Builder(TTI);
8281     unsigned CommonVF = Mask.size();
8282     if (!V1 && !V2 && !P2.isNull()) {
8284       const TreeEntry *E = P1.get<const TreeEntry *>();
8285       unsigned VF = E->getVectorFactor();
8286       const TreeEntry *E2 = P2.get<const TreeEntry *>();
8287       CommonVF = std::max(VF, E2->getVectorFactor());
8290                return Idx < 2 * static_cast<int>(CommonVF);
8292              "All elements in mask must be less than 2 * CommonVF.");
8293       if (E->Scalars.size() == E2->Scalars.size()) {
8297         for (int &Idx : CommonMask) {
8300           if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
8302           else if (Idx >= static_cast<int>(CommonVF))
8303             Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
8307         CommonVF = E->Scalars.size();
8311         V2 = getAllOnesValue(
8313     } else if (!V1 && P2.isNull()) {
8315       const TreeEntry *E = P1.get<const TreeEntry *>();
8316       unsigned VF = E->getVectorFactor();
8320              [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8321              "All elements in mask must be less than CommonVF.");
8322       if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8324         assert(!EMask.empty() && "Expected non-empty common mask.");
8325         for (int &Idx : CommonMask) {
8329         CommonVF = E->Scalars.size();
8334       if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8335           CommonVF == CommonMask.size() &&
8337                  [](const auto &&P) {
8339                             static_cast<unsigned>(P.value()) != P.index();
8347     } else if (V1 && P2.isNull()) {
8349       CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
8352              [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8353              "All elements in mask must be less than CommonVF.");
8354     } else if (V1 && !V2) {
8356       unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8357       const TreeEntry *E2 = P2.get<const TreeEntry *>();
8358       CommonVF = std::max(VF, E2->getVectorFactor());
8361                return Idx < 2 * static_cast<int>(CommonVF);
8363              "All elements in mask must be less than 2 * CommonVF.");
8364       if (E2->Scalars.size() == VF && VF != CommonVF) {
8366         assert(!E2Mask.empty() && "Expected non-empty common mask.");
8367         for (int &Idx : CommonMask) {
8370           if (Idx >= static_cast<int>(CommonVF))
8371             Idx = E2Mask[Idx - CommonVF] + VF;
8377         V2 = getAllOnesValue(
8380     } else if (!V1 && V2) {
8382       unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
8383       const TreeEntry *E1 = P1.get<const TreeEntry *>();
8384       CommonVF = std::max(VF, E1->getVectorFactor());
8387                return Idx < 2 * static_cast<int>(CommonVF);
8389              "All elements in mask must be less than 2 * CommonVF.");
8390       if (E1->Scalars.size() == VF && VF != CommonVF) {
8392         assert(!E1Mask.empty() && "Expected non-empty common mask.");
8393         for (int &Idx : CommonMask) {
8396           if (Idx >= static_cast<int>(CommonVF))
8397             Idx = E1Mask[Idx - CommonVF] + VF;
8405         V2 = getAllOnesValue(
8409     assert(V1 && V2 && "Expected both vectors.");
8410     unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8412         std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
8415              return Idx < 2 * static_cast<int>(CommonVF);
8417            "All elements in mask must be less than 2 * CommonVF.");
8418     if (V1->getType() != V2->getType()) {
8420           cast<FixedVectorType>(V1->getType())->getElementType(), CommonVF));
8421       V2 = getAllOnesValue(
8423           cast<FixedVectorType>(V1->getType())->getElementType(),
8428             cast<FixedVectorType>(V1->getType())->getElementType(),
8429             CommonMask.size()));
8430     if (InVectors.size() == 2)
8432     return BaseShuffleAnalysis::createShuffle<InstructionCost>(
8433         V1, V2, CommonMask, Builder);
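// A minimal standalone sketch of the mask normalization performed above: when
// the two shuffle sources are widened to a common vector factor, mask indices
// that referred to the second source are rebased so they start at CommonVF.
// Plain std::vector<int> stands in for the mask type used in the real code.
#include <vector>

static void rebaseSecondSourceIndices(std::vector<int> &Mask, unsigned OldVF,
                                      unsigned CommonVF) {
  for (int &Idx : Mask)
    if (Idx >= static_cast<int>(OldVF)) // an element of the second source
      Idx = (Idx - static_cast<int>(OldVF)) + static_cast<int>(CommonVF);
}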
8440       : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()),
8441         R(R), CheckedExtracts(CheckedExtracts) {}
8443                         ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8444                         unsigned NumParts, bool &UseVecBaseAsInput) {
8445     UseVecBaseAsInput = false;
8448     Value *VecBase = nullptr;
8451     if (NumParts == VL.size())
8455     bool PrevNodeFound = any_of(
8457         [&](const std::unique_ptr<TreeEntry> &TE) {
8458           return ((!TE->isAltShuffle() &&
8459                    TE->getOpcode() == Instruction::ExtractElement) ||
8460                   TE->State == TreeEntry::NeedToGather) &&
8461                  all_of(enumerate(TE->Scalars), [&](auto &&Data) {
8462                    return VL.size() > Data.index() &&
8463                           (Mask[Data.index()] == PoisonMaskElem ||
8464                            isa<UndefValue>(VL[Data.index()]) ||
8465                            Data.value() == VL[Data.index()]);
8469     unsigned SliceSize = VL.size() / NumParts;
8470     for (unsigned Part = 0; Part < NumParts; ++Part) {
8471       ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
8472       for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
8474         if (isa<UndefValue>(V) ||
8483         auto *EE = cast<ExtractElementInst>(V);
8484         VecBase = EE->getVectorOperand();
8485         UniqueBases.insert(VecBase);
8486         const TreeEntry *VE = R.getTreeEntry(V);
8487         if (!CheckedExtracts.insert(V).second ||
8488             !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
8494         unsigned Idx = *EEIdx;
8496         if (EE->hasOneUse() || !PrevNodeFound) {
8498           if (isa<SExtInst, ZExtInst>(Ext) &&
8499               all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8504                 EE->getVectorOperandType(), Idx);
8507                 Ext->getOpcode(), Ext->getType(), EE->getType(),
8523     Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8526     transformMaskAfterShuffle(CommonMask, CommonMask);
8527     SameNodesEstimated = false;
8528     if (NumParts != 1 && UniqueBases.size() != 1) {
8529       UseVecBaseAsInput = true;
8537   std::optional<InstructionCost>
8541       return std::nullopt;
8547              return Idx < static_cast<int>(E1.getVectorFactor());
8549            "Expected single vector shuffle mask.");
8553     if (InVectors.empty()) {
8554       CommonMask.assign(Mask.begin(), Mask.end());
8555       InVectors.assign({&E1, &E2});
8558     assert(!CommonMask.empty() && "Expected non-empty common mask.");
8562     if (NumParts == 0 || NumParts >= Mask.size())
8564     unsigned SliceSize = Mask.size() / NumParts;
8567     unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8568     estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
8571     if (InVectors.empty()) {
8572       CommonMask.assign(Mask.begin(), Mask.end());
8573       InVectors.assign(1, &E1);
8576     assert(!CommonMask.empty() && "Expected non-empty common mask.");
8580     if (NumParts == 0 || NumParts >= Mask.size())
8582     unsigned SliceSize = Mask.size() / NumParts;
8585     unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8586     estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
8587     if (!SameNodesEstimated && InVectors.size() == 1)
8600                   cast<ExtractElementInst>(InVectors.front()
8601                                                .get<const TreeEntry *>()
8602                                                ->Scalars[P.index()]);
8603               return EI->getVectorOperand() == V1 ||
8604                      EI->getVectorOperand() == V2;
8606            "Expected extractelement vectors.");
8610     if (InVectors.empty()) {
8612              "Expected empty input mask/vectors.");
8613       CommonMask.assign(Mask.begin(), Mask.end());
8620         InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
8624                          .get<const TreeEntry *>()
8625                          ->Scalars[P.index()];
8627               return P.value() == Mask[P.index()] ||
8628                      isa<UndefValue>(Scalar);
8629             if (isa<Constant>(V1))
8631             auto *EI = cast<ExtractElementInst>(Scalar);
8632             return EI->getVectorOperand() == V1;
8634            "Expected only tree entry for extractelement vectors.");
8638            "Expected only tree entries from extracts/reused buildvectors.");
8639     unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8640     if (InVectors.size() == 2) {
8641       Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
8642       transformMaskAfterShuffle(CommonMask, CommonMask);
8643       VF = std::max<unsigned>(VF, CommonMask.size());
8644     } else if (const auto *InTE =
8645                    InVectors.front().dyn_cast<const TreeEntry *>()) {
8646       VF = std::max(VF, InTE->getVectorFactor());
8650                         ->getNumElements());
8653     for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8655         CommonMask[Idx] = Mask[Idx] + VF;
8658                 Value *Root = nullptr) {
8659     Cost += getBuildVectorCost(VL, Root);
8663       unsigned VF = VL.size();
8665         VF = std::min(VF, MaskVF);
8667         if (isa<UndefValue>(V)) {
8677                  cast<FixedVectorType>(Root->getType())->getNumElements()),
8678              getAllOnesValue(*R.DL, VL.front()->getType()));
8688     if (InVectors.size() == 2)
8689       Cost += createShuffle(Vec, InVectors.back(), CommonMask);
8691       Cost += createShuffle(Vec, nullptr, CommonMask);
8692     for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8696            "Expected vector length for the final value before action.");
8698       Action(V, CommonMask);
8699       InVectors.front() = V;
8702     if (CommonMask.empty()) {
8703       assert(InVectors.size() == 1 && "Expected only one vector with no mask");
8707         createShuffle(InVectors.front(),
8708                       InVectors.size() == 2 ? InVectors.back() : nullptr,
8714            "Shuffle construction must be finalized.");
8718 const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
8719                                                    unsigned Idx) const {
8721   if (const TreeEntry *TE = getTreeEntry(Op)) {
8722     if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8723           return EI.EdgeIdx == Idx && EI.UserTE == E;
8724         }) != TE->UserTreeIndices.end())
8726     auto MIt = MultiNodeScalars.find(Op);
8727     if (MIt != MultiNodeScalars.end()) {
8728       for (const TreeEntry *TE : MIt->second) {
8729         if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8730               return EI.EdgeIdx == Idx && EI.UserTE == E;
8731             }) != TE->UserTreeIndices.end())
8737       find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
8738         return TE->State == TreeEntry::NeedToGather &&
8739                find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8740                  return EI.EdgeIdx == Idx && EI.UserTE == E;
8741                }) != TE->UserTreeIndices.end();
8743   assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
8748   if (TE.State == TreeEntry::ScatterVectorize ||
8749       TE.State == TreeEntry::StridedVectorize)
8751   if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
8752       !TE.isAltShuffle()) {
8753     if (TE.ReorderIndices.empty())
8792   Type *ScalarTy = VL[0]->getType();
8793   if (E->State != TreeEntry::NeedToGather) {
8794     if (auto *SI = dyn_cast<StoreInst>(VL[0]))
8795       ScalarTy = SI->getValueOperand()->getType();
8796     else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
8798     else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
8799       ScalarTy = IE->getOperand(1)->getType();
8808   auto It = MinBWs.find(E);
8809   Type *OrigScalarTy = ScalarTy;
8810   if (It != MinBWs.end()) {
8814   unsigned EntryVF = E->getVectorFactor();
8817   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
8818   if (E->State == TreeEntry::NeedToGather) {
8821     if (isa<InsertElementInst>(VL[0]))
8823     return processBuildVector<ShuffleCostEstimator, InstructionCost>(
8824         E, *TTI, VectorizedVals, *this, CheckedExtracts);
8829   if (!E->ReorderIndices.empty() &&
8830       (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
8832     if (E->getOpcode() == Instruction::Store) {
8834       NewMask.resize(E->ReorderIndices.size());
8835       copy(E->ReorderIndices, NewMask.begin());
8841   if (NeedToShuffleReuses)
8842     ::addMask(Mask, E->ReuseShuffleIndices);
8846   assert((E->State == TreeEntry::Vectorize ||
8847           E->State == TreeEntry::ScatterVectorize ||
8848           E->State == TreeEntry::StridedVectorize) &&
8852           (E->getOpcode() == Instruction::GetElementPtr &&
8853            E->getMainOp()->getType()->isPointerTy())) &&
8856   unsigned ShuffleOrOp =
8857       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
8859   const unsigned Sz = UniqueValues.size();
8861   for (unsigned I = 0; I < Sz; ++I) {
8862     if (getTreeEntry(UniqueValues[I]) == E)
8866   auto GetCastContextHint = [&](Value *V) {
8867     if (const TreeEntry *OpTE = getTreeEntry(V))
8868       return getCastContextHint(*OpTE);
8869     InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
8870     if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
8879     if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
8883       ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
8885       for (unsigned I = 0; I < Sz; ++I) {
8886         if (UsedScalars.test(I))
8888         ScalarCost += ScalarEltCost(I);
8896       const EdgeInfo &EI = E->UserTreeIndices.front();
8897       if ((EI.UserTE->getOpcode() != Instruction::Select ||
8899           It != MinBWs.end()) {
8900         auto UserBWIt = MinBWs.find(EI.UserTE);
8901         Type *UserScalarTy =
8902             EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
8903         if (UserBWIt != MinBWs.end())
8905                                   UserBWIt->second.first);
8906         if (ScalarTy != UserScalarTy) {
8907           unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
8908           unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
8913             VecOpcode = Instruction::Trunc;
8916                 It->second.second ? Instruction::SExt : Instruction::ZExt;
8923     LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost,
8924                              "Calculated costs for Tree"));
8925     return VecCost - ScalarCost;
8930     assert((E->State == TreeEntry::Vectorize ||
8931             E->State == TreeEntry::StridedVectorize) &&
8932            "Entry state expected to be Vectorize or StridedVectorize here.");
8936         *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
8937     LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
8938                              "Calculated GEPs cost for Tree"));
8940     return VecCost - ScalarCost;
8943   switch (ShuffleOrOp) {
8944   case Instruction::PHI: {
8948     for (Value *V : UniqueValues) {
8949       auto *PHI = dyn_cast<PHINode>(V);
8954       for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
8958       if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
8960         if (!OpTE->ReuseShuffleIndices.empty())
8961           ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
8962                                           OpTE->Scalars.size());
8965     return CommonCost - ScalarCost;
8967   case Instruction::ExtractValue:
8968   case Instruction::ExtractElement: {
8969     auto GetScalarCost = [&](unsigned Idx) {
8970       auto *I = cast<Instruction>(UniqueValues[Idx]);
8972       if (ShuffleOrOp == Instruction::ExtractElement) {
8973         auto *EE = cast<ExtractElementInst>(I);
8974         SrcVecTy = EE->getVectorOperandType();
8976         auto *EV = cast<ExtractValueInst>(I);
8977         Type *AggregateTy = EV->getAggregateOperand()->getType();
8979         if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
8980           NumElts = ATy->getNumElements();
8985       if (I->hasOneUse()) {
8987         if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
8988             all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8995               Ext->getOpcode(), Ext->getType(), I->getType(),
9003     auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
9004     return GetCostDiff(GetScalarCost, GetVectorCost);
9006   case Instruction::InsertElement: {
9007     assert(E->ReuseShuffleIndices.empty() &&
9008            "Unique insertelements only are expected.");
9009     auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
9010     unsigned const NumElts = SrcVecTy->getNumElements();
9011     unsigned const NumScalars = VL.size();
9017     unsigned OffsetEnd = OffsetBeg;
9018     InsertMask[OffsetBeg] = 0;
9021       if (OffsetBeg > Idx)
9023       else if (OffsetEnd < Idx)
9025       InsertMask[Idx] = I + 1;
9029       VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
9030     unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9032     unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9033     unsigned InsertVecSz = std::min<unsigned>(
9035         ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9036     bool IsWholeSubvector =
9037         OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
9041     if (OffsetBeg + InsertVecSz > VecSz) {
9044       InsertVecSz = VecSz;
9050     if (!E->ReorderIndices.empty()) {
9055       std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
9057     bool IsIdentity = true;
9059     Mask.swap(PrevMask);
9060     for (unsigned I = 0; I < NumScalars; ++I) {
9062       DemandedElts.setBit(InsertIdx);
9063       IsIdentity &= InsertIdx - OffsetBeg == I;
9064       Mask[InsertIdx - OffsetBeg] = I;
9066     assert(Offset < NumElts && "Failed to find vector index offset");
9081     auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
9082       return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
9090         buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
9091     if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
9092       if (InsertVecSz != VecSz) {
9104         for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
9113 case Instruction::ZExt:
9114 case Instruction::SExt:
9115 case Instruction::FPToUI:
9116 case Instruction::FPToSI:
9117 case Instruction::FPExt:
9118 case Instruction::PtrToInt:
9119 case Instruction::IntToPtr:
9120 case Instruction::SIToFP:
9121 case Instruction::UIToFP:
9122 case Instruction::Trunc:
9123 case Instruction::FPTrunc:
9124 case Instruction::BitCast: {
9125     auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9128     unsigned Opcode = ShuffleOrOp;
9129     unsigned VecOpcode = Opcode;
9131         (SrcIt != MinBWs.end() || It != MinBWs.end())) {
9133       unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
9134       if (SrcIt != MinBWs.end()) {
9135         SrcBWSz = SrcIt->second.first;
9139       unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9140       if (BWSz == SrcBWSz) {
9141         VecOpcode = Instruction::BitCast;
9142       } else if (BWSz < SrcBWSz) {
9143         VecOpcode = Instruction::Trunc;
9144       } else if (It != MinBWs.end()) {
9145         assert(BWSz > SrcBWSz && "Invalid cast!");
9146         VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9147       } else if (SrcIt != MinBWs.end()) {
9148         assert(BWSz > SrcBWSz && "Invalid cast!");
9150             SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9152     } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
9153                !SrcIt->second.second) {
9154       VecOpcode = Instruction::UIToFP;
9157       auto *VI = cast<Instruction>(UniqueValues[Idx]);
9165       if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9167       auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
9171           VecOpcode == Opcode ? VI : nullptr);
9173     return GetCostDiff(GetScalarCost, GetVectorCost);
9175   case Instruction::FCmp:
9176   case Instruction::ICmp:
9177   case Instruction::Select: {
9181         match(VL0, MatchCmp))
9187     auto GetScalarCost = [&](unsigned Idx) {
9188       auto *VI = cast<Instruction>(UniqueValues[Idx]);
9194            !match(VI, MatchCmp)) ||
9195           (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9201                                        Builder.getInt1Ty(), CurrentPred, CostKind,
9208           E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
9220       if (IntrinsicAndUse.second)
9223       VecCost = std::min(VecCost, IntrinsicCost);
9225       return VecCost + CommonCost;
9227     return GetCostDiff(GetScalarCost, GetVectorCost);
9229 case Instruction::FNeg:
9230 case Instruction::Add:
9231 case Instruction::FAdd:
9232 case Instruction::Sub:
9233 case Instruction::FSub:
9234 case Instruction::Mul:
9235 case Instruction::FMul:
9236 case Instruction::UDiv:
9237 case Instruction::SDiv:
9238 case Instruction::FDiv:
9239 case Instruction::URem:
9240 case Instruction::SRem:
9241 case Instruction::FRem:
9242 case Instruction::Shl:
9243 case Instruction::LShr:
9244 case Instruction::AShr:
9245 case Instruction::And:
9246 case Instruction::Or:
9247 case Instruction::Xor: {
9248     auto GetScalarCost = [&](unsigned Idx) {
9249       auto *VI = cast<Instruction>(UniqueValues[Idx]);
9250       unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
9259       unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
9263                                       Op2Info, std::nullopt, nullptr, TLI) +
9266     return GetCostDiff(GetScalarCost, GetVectorCost);
9268   case Instruction::GetElementPtr: {
9269     return CommonCost + GetGEPCostDiff(VL, VL0);
9271   case Instruction::Load: {
9272     auto GetScalarCost = [&](unsigned Idx) {
9273       auto *VI = cast<LoadInst>(UniqueValues[Idx]);
9275                                   VI->getAlign(), VI->getPointerAddressSpace(),
9278       auto *LI0 = cast<LoadInst>(VL0);
9281       if (E->State == TreeEntry::Vectorize) {
9283             Instruction::Load, VecTy, LI0->getAlign(),
9285       } else if (E->State == TreeEntry::StridedVectorize) {
9286         Align CommonAlignment =
9287             computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9289             Instruction::Load, VecTy, LI0->getPointerOperand(),
9292         assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
9293         Align CommonAlignment =
9294             computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9296             Instruction::Load, VecTy, LI0->getPointerOperand(),
9299       return VecLdCost + CommonCost;
9305     if (E->State == TreeEntry::ScatterVectorize)
9311       PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
9312     return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9314   case Instruction::Store: {
9315     bool IsReorder = !E->ReorderIndices.empty();
9316     auto GetScalarCost = [=](unsigned Idx) {
9317       auto *VI = cast<StoreInst>(VL[Idx]);
9320                                   VI->getAlign(), VI->getPointerAddressSpace(),
9324         cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9329                                       BaseSI->getPointerAddressSpace(), CostKind,
9335       unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
9336       PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
9339     return GetCostDiff(GetScalarCost, GetVectorCost) +
9340            GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9342   case Instruction::Call: {
9343     auto GetScalarCost = [&](unsigned Idx) {
9344       auto *CI = cast<CallInst>(UniqueValues[Idx]);
9355       auto *CI = cast<CallInst>(VL0);
9359           It != MinBWs.end() ? It->second.first : 0);
9361       return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
9363     return GetCostDiff(GetScalarCost, GetVectorCost);
9365   case Instruction::ShuffleVector: {
9366     assert(E->isAltShuffle() &&
9371             (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9372            "Invalid Shuffle Vector Operand");
9375     auto TryFindNodeWithEqualOperands = [=]() {
9376       for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9379         if (TE->isAltShuffle() &&
9380             ((TE->getOpcode() == E->getOpcode() &&
9381               TE->getAltOpcode() == E->getAltOpcode()) ||
9382              (TE->getOpcode() == E->getAltOpcode() &&
9383               TE->getAltOpcode() == E->getOpcode())) &&
9384             TE->hasEqualOperands(*E))
9389     auto GetScalarCost = [&](unsigned Idx) {
9390       auto *VI = cast<Instruction>(UniqueValues[Idx]);
9391       assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
9401       if (TryFindNodeWithEqualOperands()) {
9403           dbgs() << "SLP: diamond match for alternate node found.\n";
9410             TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
9412             TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
9413       } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
9415         VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
9416                                             CI0->getPredicate(), CostKind, VL0);
9417         VecCost += TTIRef.getCmpSelInstrCost(
9418             E->getOpcode(), VecTy, MaskTy,
9419             cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
9422         Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
9425           auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9426           unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9428               DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
9429           if (SrcIt != MinBWs.end()) {
9430             SrcBWSz = SrcIt->second.first;
9434           if (BWSz <= SrcBWSz) {
9437                 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
9441                 << "SLP: alternate extension, which should be truncated.\n";
9447         VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
9450             TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
9454       E->buildAltOpShuffleMask(
9456             assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
9457             return I->getOpcode() == E->getAltOpcode();
9466       unsigned Opcode0 = E->getOpcode();
9467       unsigned Opcode1 = E->getAltOpcode();
9470       for (unsigned Lane : seq<unsigned>(0, E->Scalars.size()))
9471         if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1)
9472           OpcodeMask.set(Lane);
9475       if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9477             VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
9478         return AltVecCost < VecCost ? AltVecCost : VecCost;
9483     return GetCostDiff(GetScalarCost, GetVectorCost);
9490 bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
9492                     << VectorizableTree.size() << " is fully vectorizable .\n");
9494   auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
9496     return TE->State == TreeEntry::NeedToGather &&
9498                     [this](Value *V) { return EphValues.contains(V); }) &&
9500             TE->Scalars.size() < Limit ||
9501             ((TE->getOpcode() == Instruction::ExtractElement ||
9502               all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
9504             (TE->State == TreeEntry::NeedToGather &&
9505              TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
9509   if (VectorizableTree.size() == 1 &&
9510       (VectorizableTree[0]->State == TreeEntry::Vectorize ||
9512         AreVectorizableGathers(VectorizableTree[0].get(),
9513                                VectorizableTree[0]->Scalars.size()) &&
9514         VectorizableTree[0]->getVectorFactor() > 2)))
9517   if (VectorizableTree.size() != 2)
9525   if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
9526       AreVectorizableGathers(VectorizableTree[1].get(),
9527                              VectorizableTree[0]->Scalars.size()))
9531   if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
9532       (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9533        VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
9534        VectorizableTree[0]->State != TreeEntry::StridedVectorize))
9542                                        bool MustMatchOrInst) {
9546   Value *ZextLoad = Root;
9547   const APInt *ShAmtC;
9548   bool FoundOr = false;
9549   while (!isa<ConstantExpr>(ZextLoad) &&
9552           ShAmtC->urem(8) == 0))) {
9553     auto *BinOp = cast<BinaryOperator>(ZextLoad);
9554     ZextLoad = BinOp->getOperand(0);
9555     if (BinOp->getOpcode() == Instruction::Or)
9560   if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
9567   Type *SrcTy = Load->getType();
9574   LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
9575                     << *(cast<Instruction>(Root)) << "\n");
9584   unsigned NumElts = VectorizableTree[0]->Scalars.size();
9585   Value *FirstReduced = VectorizableTree[0]->Scalars[0];
9593   unsigned NumElts = VectorizableTree[0]->Scalars.size();
9594   for (Value *Scalar : VectorizableTree[0]->Scalars) {
9605   if (VectorizableTree.size() == 2 &&
9606       isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
9607       VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9608       (VectorizableTree[1]->getVectorFactor() <= 2 ||
9609        !(isSplat(VectorizableTree[1]->Scalars) ||
9617   constexpr int Limit = 4;
9619       !VectorizableTree.empty() &&
9620       all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9621         return (TE->State == TreeEntry::NeedToGather &&
9622                 TE->getOpcode() != Instruction::ExtractElement &&
9623                 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
9624                TE->getOpcode() == Instruction::PHI;
9635   if (isFullyVectorizableTinyTree(ForReduction))
9640   bool IsAllowedSingleBVNode =
9641       VectorizableTree.size() > 1 ||
9642       (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
9643        !VectorizableTree.front()->isAltShuffle() &&
9644        VectorizableTree.front()->getOpcode() != Instruction::PHI &&
9645        VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
9647   if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9648         return TE->State == TreeEntry::NeedToGather &&
9650                  return isa<ExtractElementInst, UndefValue>(V) ||
9651                         (IsAllowedSingleBVNode &&
9652                          !V->hasNUsesOrMore(UsesLimit) &&
9653                          any_of(V->users(), IsaPred<InsertElementInst>));
9658   assert(VectorizableTree.empty()
9659              ? ExternalUses.empty()
9660              : true && "We shouldn't have any external users");
9672   unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
9685   for (const auto &TEPtr : VectorizableTree) {
9686     if (TEPtr->State != TreeEntry::Vectorize)
9688     Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
9694     auto *NodeA = DT->getNode(A->getParent());
9695     auto *NodeB = DT->getNode(B->getParent());
9696     assert(NodeA && "Should only process reachable instructions");
9697     assert(NodeB && "Should only process reachable instructions");
9698     assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
9699            "Different nodes should have different DFS numbers");
9701       return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
9702     return B->comesBefore(A);
9712     LiveValues.erase(PrevInst);
9713     for (auto &J : PrevInst->operands()) {
9714       if (isa<Instruction>(&*J) && getTreeEntry(&*J))
9715         LiveValues.insert(cast<Instruction>(&*J));
9719       dbgs() << "SLP: #LV: " << LiveValues.size();
9720       for (auto *X : LiveValues)
9721         dbgs() << " " << X->getName();
9722       dbgs() << ", Looking at ";
9727     unsigned NumCalls = 0;
9731     while (InstIt != PrevInstIt) {
9733         PrevInstIt = Inst->getParent()->rbegin();
9738       if (auto *II = dyn_cast<IntrinsicInst>(I)) {
9739         if (II->isAssumeLikeIntrinsic())
9743         for (auto &ArgOp : II->args())
9745         if (auto *FPMO = dyn_cast<FPMathOperator>(II))
9746           FMF = FPMO->getFastMathFlags();
9753         if (IntrCost < CallCost)
9760       if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
9761           &*PrevInstIt != PrevInst)
9769     for (auto *II : LiveValues) {
9770       auto *ScalarTy = II->getType();
9771       if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
9772         ScalarTy = VectorTy->getElementType();
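// A minimal standalone sketch of the spill-cost accounting above: the walk
// keeps track of how many tree values are live between two vectorized
// instructions and charges a per-value cost for every real call (assume-like
// and cheap intrinsics excluded) crossed while those values are live. The
// input vectors and the per-value constant are hypothetical simplifications
// of what the real code derives from the schedule and from TTI.
#include <algorithm>
#include <cstddef>
#include <vector>

// LiveCount[i]: number of tree values live right before instruction i.
// IsRealCall[i]: instruction i is a call that is not lowered to a cheap
// intrinsic and therefore may force the live values into stack slots.
static long estimateSpillCost(const std::vector<unsigned> &LiveCount,
                              const std::vector<bool> &IsRealCall,
                              long SpillCostPerValue) {
  long Cost = 0;
  const size_t N = std::min(LiveCount.size(), IsRealCall.size());
  for (size_t I = 0; I < N; ++I)
    if (IsRealCall[I])
      Cost += SpillCostPerValue * static_cast<long>(LiveCount[I]);
  return Cost;
}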
9790   const auto *I1 = IE1;
9791   const auto *I2 = IE2;
9803     if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
9805       I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
9806     if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
9808       I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
9809   } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
9817   template <typename U>
9818   static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
9821   template <typename U>
9822   static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
9840 template <typename T>
9846   assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
9848   auto VMIt = std::next(ShuffleMask.begin());
9851       buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
9853   if (!IsBaseUndef.all()) {
9855     std::pair<T *, bool> Res =
9856         ResizeAction(ShuffleMask.begin()->first, Mask, false);
9858     for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
9862         Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
9864     auto *V = ValueSelect::get<T *>(Base);
9866     assert((!V || GetVF(V) == Mask.size()) &&
9867            "Expected base vector of VF number of elements.");
9868     Prev = Action(Mask, {nullptr, Res.first});
9869   } else if (ShuffleMask.size() == 1) {
9872     std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
9878     Prev = Action(Mask, {ShuffleMask.begin()->first});
9882     unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
9883     unsigned Vec2VF = GetVF(VMIt->first);
9884     if (Vec1VF == Vec2VF) {
9888       for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9891           Mask[I] = SecMask[I] + Vec1VF;
9894       Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
9897       std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
9899       std::pair<T *, bool> Res2 =
9900           ResizeAction(VMIt->first, VMIt->second, false);
9902       for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9909           Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
9912       Prev = Action(Mask, {Res1.first, Res2.first});
9914     VMIt = std::next(VMIt);
9916   bool IsBaseNotUndef = !IsBaseUndef.all();
9917   (void)IsBaseNotUndef;
9919   for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
9921     std::pair<T *, bool> Res =
9922         ResizeAction(VMIt->first, VMIt->second, false);
9924     for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9927              "Multiple uses of scalars.");
9928       Mask[I] = (Res.second ? I : SecMask[I]) + VF;
9933     Prev = Action(Mask, {Prev, Res.first});
9941                     << VectorizableTree.size() << ".\n");
9943   unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
9946   for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
9947     TreeEntry &TE = *VectorizableTree[I];
9948     if (TE.State == TreeEntry::NeedToGather) {
9949       if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
9950           E && E->getVectorFactor() == TE.getVectorFactor() &&
9951           E->isSame(TE.Scalars)) {
9956                << "SLP: Current total cost = " << Cost << "\n");
9965                << "SLP: Current total cost = " << Cost << "\n");
9975   std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
9976   for (ExternalUser &EU : ExternalUses) {
9978     if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
9979         !ExtractCostCalculated.insert(EU.Scalar).second)
9985     if (EphValues.count(EU.User))
9989     if (isa<FixedVectorType>(EU.Scalar->getType()))
9994     if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
9995       if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
9996         if (!UsedInserts.insert(VU).second)
10000         const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
10003             [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
10005                   VU, cast<InsertElementInst>(Pair.first),
10007                     Value *Op0 = II->getOperand(0);
10008                     if (getTreeEntry(II) && !getTreeEntry(Op0))
10014         if (It == FirstUsers.end()) {
10021           while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
10022             if (IEBase != EU.User &&
10023                 (!IEBase->hasOneUse() ||
10027             if (const TreeEntry *E = getTreeEntry(IEBase)) {
10030                 IEBase = cast<InsertElementInst>(Base);
10033                        "InsertElementInstruction used already.");
10035                 Base = IEBase->getOperand(0);
10036               } while (E == getTreeEntry(Base));
10039             Base = cast<InsertElementInst>(Base)->getOperand(0);
10043           VecId = FirstUsers.size() - 1;
10044           auto It = MinBWs.find(ScalarTE);
10045           if (It != MinBWs.end() &&
10047                   .insert(std::make_pair(ScalarTE, FTy->getElementType()))
10049             unsigned BWSz = It->second.first;
10050             unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
10051             unsigned VecOpcode;
10052             if (DstBWSz < BWSz)
10053               VecOpcode = Instruction::Trunc;
10056                   It->second.second ? Instruction::SExt : Instruction::ZExt;
10062                                       FTy->getNumElements()),
10065                        << " for extending externally used vector with "
10066                           "non-equal minimum bitwidth.\n");
10072           VecId = std::distance(FirstUsers.begin(), It);
10074         int InIdx = *InsertIdx;
10078         Mask[InIdx] = EU.Lane;
10079         DemandedElts[VecId].setBit(InIdx);
10087     if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10088       if (!ValueToExtUses) {
10089         ValueToExtUses.emplace();
10091           ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10097             if (!getTreeEntry(V))
10099             auto It = ValueToExtUses->find(V);
10100             if (It != ValueToExtUses->end()) {
10102               ExternalUses[It->second].User = nullptr;
10107       if (CanBeUsedAsGEP) {
10109         ExternalUsesAsGEPs.insert(EU.Scalar);
10118     auto It = MinBWs.find(getTreeEntry(EU.Scalar));
10119     if (It != MinBWs.end()) {
10122         It->second.second ? Instruction::SExt : Instruction::ZExt;
10132   if (!VectorizedVals.empty()) {
10133     const TreeEntry &Root = *VectorizableTree.front().get();
10134     auto BWIt = MinBWs.find(&Root);
10135     if (BWIt != MinBWs.end()) {
10136       Type *DstTy = Root.Scalars.front()->getType();
10137       unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
10139           ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10140       if (OriginalSz != SrcSz) {
10141         unsigned Opcode = Instruction::Trunc;
10142         if (OriginalSz > SrcSz)
10143           Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10153   Cost += SpillCost + ExtractCost;
10157     unsigned VF = Mask.size();
10158     unsigned VecVF = TE->getVectorFactor();
10160         (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
10163       std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
10169         dbgs() << "SLP: Adding cost " << C
10170                << " for final shuffle of insertelement external users.\n";
10171         TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10173       return std::make_pair(TE, true);
10175     return std::make_pair(TE, false);
10178   for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10179     Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
10180     auto Vector = ShuffleMasks[I].takeVector();
10184       assert((TEs.size() == 1 || TEs.size() == 2) &&
10185              "Expected exactly 1 or 2 tree entries.");
10186       if (TEs.size() == 1) {
10188           VF = TEs.front()->getVectorFactor();
10194                 (Data.index() < VF &&
10195                  static_cast<int>(Data.index()) == Data.value());
10200                  << " for final shuffle of insertelement "
10201                     "external users.\n";
10202           TEs.front()->dump();
10203           dbgs() << "SLP: Current total cost = " << Cost << "\n");
10209             TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10210           VF = TEs.front()->getVectorFactor();
10219                  << " for final shuffle of vector node and external "
10220                     "insertelement users.\n";
10221           if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10222           dbgs() << "SLP: Current total cost = " << Cost << "\n");
10228     (void)performExtractsShuffleAction<const TreeEntry>(
10230         [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
10231         EstimateShufflesCost);
10233         cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
10235     Cost -= InsertCost;
10239   if (ReductionBitWidth != 0) {
10240     assert(UserIgnoreList && "Expected reduction tree.");
10241     const TreeEntry &E = *VectorizableTree.front().get();
10242     auto It = MinBWs.find(&E);
10243     if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10244       unsigned SrcSize = It->second.first;
10245       unsigned DstSize = ReductionBitWidth;
10246       unsigned Opcode = Instruction::Trunc;
10247       if (SrcSize < DstSize)
10248         Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10255       switch (E.getOpcode()) {
10256       case Instruction::SExt:
10257       case Instruction::ZExt:
10258       case Instruction::Trunc: {
10259         const TreeEntry *OpTE = getOperandEntry(&E, 0);
10260         CCH = getCastContextHint(*OpTE);
10270                  << " for final resize for reduction from " << SrcVecTy
10271                  << " to " << DstVecTy << "\n";
10272           dbgs() << "SLP: Current total cost = " << Cost << "\n");
10280     OS << "SLP: Spill Cost = " << SpillCost << ".\n"
10281        << "SLP: Extract Cost = " << ExtractCost << ".\n"
10282        << "SLP: Total Cost = " << Cost << ".\n";
10286     ViewGraph(this, "SLP" + F->getName(), false, Str);
10297 std::optional<TTI::ShuffleKind>
10298 BoUpSLP::tryToGatherSingleRegisterExtractElements(
10304   for (int I = 0, E = VL.size(); I < E; ++I) {
10305     auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10307       if (isa<UndefValue>(VL[I]))
10311     auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
10312     if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
10321       ExtractMask.reset(*Idx);
10326     VectorOpToIdx[EI->getVectorOperand()].push_back(I);
10330   for (const auto &Data : VectorOpToIdx)
10331     VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
10332         .push_back(Data.first);
10333   for (auto &Data : VFToVector) {
10335       return VectorOpToIdx.find(V1)->second.size() >
10336              VectorOpToIdx.find(V2)->second.size();
10341   const int UndefSz = UndefVectorExtracts.size();
10342   unsigned SingleMax = 0;
10343   Value *SingleVec = nullptr;
10344   unsigned PairMax = 0;
10345   std::pair<Value *, Value *> PairVec(nullptr, nullptr);
10346   for (auto &Data : VFToVector) {
10348     if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
10349       SingleMax = VectorOpToIdx[V1].size() + UndefSz;
10353     if (Data.second.size() > 1)
10354       V2 = *std::next(Data.second.begin());
10355     if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
10357       PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
10358       PairVec = std::make_pair(V1, V2);
10361   if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10362     return std::nullopt;
10368   if (SingleMax >= PairMax && SingleMax) {
10369     for (int Idx : VectorOpToIdx[SingleVec])
10372     for (Value *V : {PairVec.first, PairVec.second})
10373       for (int Idx : VectorOpToIdx[V])
10377   for (int Idx : UndefVectorExtracts)
10381   std::optional<TTI::ShuffleKind> Res =
10387     return std::nullopt;
10391   for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
10392     if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
10393         isa<UndefValue>(GatheredExtracts[I])) {
10397     auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10398     if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
10399         !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
10414                                  unsigned NumParts) const {
10415   assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
10418   unsigned SliceSize = VL.size() / NumParts;
10419   for (unsigned Part = 0; Part < NumParts; ++Part) {
10425     std::optional<TTI::ShuffleKind> Res =
10426         tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
10427     ShufflesRes[Part] = Res;
10428     copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
10430   if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
10431         return Res.has_value();
10433     ShufflesRes.clear();
10434   return ShufflesRes;
10437 std::optional<TargetTransformInfo::ShuffleKind>
10438 BoUpSLP::isGatherShuffledSingleRegisterEntry(
10444   const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
10445   const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
10449   if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
10450     TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
10453     TEInsertBlock = TEInsertPt->getParent();
10456     return std::nullopt;
10457   auto *NodeUI = DT->getNode(TEInsertBlock);
10458   assert(NodeUI && "Should only process reachable instructions");
10460   auto CheckOrdering = [&](const Instruction *InsertPt) {
10474     auto *NodeEUI = DT->getNode(InsertBlock);
10477     assert((NodeUI == NodeEUI) ==
10478                (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10479            "Different nodes should have different DFS numbers");
10481     if (TEInsertPt->getParent() != InsertBlock &&
10484     if (TEInsertPt->getParent() == InsertBlock &&
10498   for (Value *V : VL) {
10503     for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10507                     [&](Value *V) { return GatheredScalars.contains(V); }) &&
10508              "Must contain at least single gathered value.");
10509       assert(TEPtr->UserTreeIndices.size() == 1 &&
10510              "Expected only single user of a gather node.");
10511       const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
10513       PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
10516               : &getLastInstructionInBundle(UseEI.UserTE);
10517       if (TEInsertPt == InsertPt) {
10521         if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
10525         if (TEUseEI.UserTE != UseEI.UserTE &&
10526             TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
10532       if ((TEInsertBlock != InsertPt->getParent() ||
10533            TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
10534           !CheckOrdering(InsertPt))
10538     if (const TreeEntry *VTE = getTreeEntry(V)) {
10540       if (VTE->State != TreeEntry::Vectorize) {
10541         auto It = MultiNodeScalars.find(V);
10542         if (It == MultiNodeScalars.end())
10544         VTE = *It->getSecond().begin();
10546           auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
10547             return MTE->State == TreeEntry::Vectorize;
10549           if (MIt == It->getSecond().end())
10554       Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
10555       if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
10559     if (VToTEs.empty())
10561     if (UsedTEs.empty()) {
10575       if (!VToTEs.empty()) {
10581         VToTEs = SavedVToTEs;
10590         if (UsedTEs.size() == 2)
10592         UsedTEs.push_back(SavedVToTEs);
10599   if (UsedTEs.empty()) {
10601     return std::nullopt;
10605   if (UsedTEs.size() == 1) {
10608                                                 UsedTEs.front().end());
10609     sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
10610       return TE1->Idx < TE2->Idx;
10613     auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
10614       return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
10616     if (It != FirstEntries.end() &&
10617         ((*It)->getVectorFactor() == VL.size() ||
10618          ((*It)->getVectorFactor() == TE->Scalars.size() &&
10619           TE->ReuseShuffleIndices.size() == VL.size() &&
10620           (*It)->isSame(TE->Scalars)))) {
10621       Entries.push_back(*It);
10622       if ((*It)->getVectorFactor() == VL.size()) {
10623         std::iota(std::next(Mask.begin(), Part * VL.size()),
10624                   std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
10630         for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10631           if (isa<PoisonValue>(VL[I]))
10637       Entries.push_back(FirstEntries.front());
10640     assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
10643     for (const TreeEntry *TE : UsedTEs.front()) {
10644       unsigned VF = TE->getVectorFactor();
10645       auto It = VFToTE.find(VF);
10646       if (It != VFToTE.end()) {
10647         if (It->second->Idx > TE->Idx)
10648           It->getSecond() = TE;
10655                                                  UsedTEs.back().end());
10656     sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
10657       return TE1->Idx < TE2->Idx;
10659     for (const TreeEntry *TE : SecondEntries) {
10660       auto It = VFToTE.find(TE->getVectorFactor());
10661       if (It != VFToTE.end()) {
10663         Entries.push_back(It->second);
10664         Entries.push_back(TE);
10670     if (Entries.empty()) {
10672           UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
10673             return TE1->Idx < TE2->Idx;
10675       Entries.push_back(SecondEntries.front());
10676       VF = std::max(Entries.front()->getVectorFactor(),
10677                     Entries.back()->getVectorFactor());
10681   bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
10684   auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
10685     auto *PHI = cast<PHINode>(V);
10686     auto *PHI1 = cast<PHINode>(V1);
10691     for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
10693       Value *In1 = PHI1->getIncomingValue(I);
10698       if (cast<Instruction>(In)->getParent() !=
10708   auto MightBeIgnored = [=](Value *V) {
10709     auto *I = dyn_cast<Instruction>(V);
10710     return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
10712            !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
10717   auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
10719     bool UsedInSameVTE = false;
10720     auto It = UsedValuesEntry.find(V1);
10721     if (It != UsedValuesEntry.end())
10722       UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
10723     return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
10725            cast<Instruction>(V)->getParent() ==
10726                cast<Instruction>(V1)->getParent() &&
10727            (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
10732   for (int I = 0, E = VL.size(); I < E; ++I) {
10734     auto It = UsedValuesEntry.find(V);
10735     if (It == UsedValuesEntry.end())
10741          ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
10742           (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
10744     unsigned Idx = It->second;
10751   for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
10752     if (!UsedIdxs.test(I))
10758     for (std::pair<unsigned, int> &Pair : EntryLanes)
10759       if (Pair.first == I)
10760         Pair.first = TempEntries.size();
10763   Entries.swap(TempEntries);
10764   if (EntryLanes.size() == Entries.size() &&
10766           .slice(Part * VL.size(),
10767                  std::min<int>(VL.size(), TE->Scalars.size())))) {
10773     return std::nullopt;
10776   bool IsIdentity = Entries.size() == 1;
10779   for (const std::pair<unsigned, int> &Pair : EntryLanes) {
10780     unsigned Idx = Part * VL.size() + Pair.second;
10783         (ForOrder ? std::distance(
10784                         Entries[Pair.first]->Scalars.begin(),
10785                         find(Entries[Pair.first]->Scalars, VL[Pair.second]))
10786                   : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
10787     IsIdentity &= Mask[Idx] == Pair.second;
10789   switch (Entries.size()) {
10791     if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
10795     if (EntryLanes.size() > 2 || VL.size() <= 2)
10803   std::fill(std::next(Mask.begin(), Part * VL.size()),
10805   return std::nullopt;
10809 BoUpSLP::isGatherShuffledEntry(
10813   assert(NumParts > 0 && NumParts < VL.size() &&
10814          "Expected positive number of registers.");
10817   if (TE == VectorizableTree.front().get())
10820   if (TE->isNonPowOf2Vec())
10823   assert(TE->UserTreeIndices.size() == 1 &&
10824          "Expected only single user of the gather node.");
10826          "Number of scalars must be divisible by NumParts.");
10827   unsigned SliceSize = VL.size() / NumParts;
10829   for (unsigned Part = 0; Part < NumParts; ++Part) {
10832     std::optional<TTI::ShuffleKind> SubRes =
10833         isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
10836       SubEntries.clear();
10839         SubEntries.front()->getVectorFactor() == VL.size() &&
10840         (SubEntries.front()->isSame(TE->Scalars) ||
10841          SubEntries.front()->isSame(VL))) {
10843       LocalSubEntries.swap(SubEntries);
10846       std::iota(Mask.begin(), Mask.end(), 0);
10848       for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10849         if (isa<PoisonValue>(VL[I]))
10851       Entries.emplace_back(1, LocalSubEntries.front());
10857              [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
10865                                        bool ForPoisonSrc) const {
10867   Type *ScalarTy = VL[0]->getType();
10868   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
10869     ScalarTy = SI->getValueOperand()->getType();
10871   bool DuplicateNonConst = false;
10879   auto EstimateInsertCost = [&](unsigned I, Value *V) {
10886   for (unsigned I = 0, E = VL.size(); I < E; ++I) {
10889     if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
10897       EstimateInsertCost(I, V);
10898       ShuffleMask[I] = I;
10902       DuplicateNonConst = true;
10904       ShuffleMask[I] = Res.first->second;
10910   if (DuplicateNonConst)
10912                            VecTy, ShuffleMask);
10924   VLOperands Ops(VL, R);
10927   Left = Ops.getVL(0);
10928   Right = Ops.getVL(1);
10931 Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
10934     return *Res.second;
10938   auto *Front = E->getMainOp();
10941     if (E->getOpcode() == Instruction::GetElementPtr &&
10942         !isa<GetElementPtrInst>(V))
10944     auto *I = cast<Instruction>(V);
10945     return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
10946            isVectorLikeInstWithConstOps(I);
10949 auto FindLastInst = [&]() {
10951 for (
Value *V : E->Scalars) {
10952 auto *
I = dyn_cast<Instruction>(V);
10955 if (LastInst->
getParent() ==
I->getParent()) {
10960 assert(((E->getOpcode() == Instruction::GetElementPtr &&
10961 !isa<GetElementPtrInst>(
I)) ||
10964 "Expected vector-like or non-GEP in GEP node insts only.");
10972 auto *NodeB = DT->
getNode(
I->getParent());
10973 assert(NodeA &&
"Should only process reachable instructions");
10974 assert(NodeB &&
"Should only process reachable instructions");
10975 assert((NodeA == NodeB) ==
10976 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10977 "Different nodes should have different DFS numbers");
10978 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
10985 auto FindFirstInst = [&]() {
10987 for (
Value *V : E->Scalars) {
10988 auto *
I = dyn_cast<Instruction>(V);
10991 if (FirstInst->
getParent() ==
I->getParent()) {
10992 if (
I->comesBefore(FirstInst))
10996 assert(((E->getOpcode() == Instruction::GetElementPtr &&
10997 !isa<GetElementPtrInst>(
I)) ||
11000 "Expected vector-like or non-GEP in GEP node insts only.");
11008 auto *NodeB = DT->
getNode(
I->getParent());
11009 assert(NodeA &&
"Should only process reachable instructions");
11010 assert(NodeB &&
"Should only process reachable instructions");
11011 assert((NodeA == NodeB) ==
11012 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11013 "Different nodes should have different DFS numbers");
11014 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11023 (E->State != TreeEntry::NeedToGather &&
11025 if ((E->getOpcode() == Instruction::GetElementPtr &&
11028 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
11032 return !isVectorLikeInstWithConstOps(V) &&
11033 isUsedOutsideBlock(V);
11035 (E->State == TreeEntry::NeedToGather && E->Idx == 0 &&
11037 return isa<ExtractElementInst, UndefValue>(V) ||
11038 areAllOperandsNonInsts(V);
11040 Res.second = FindLastInst();
11042 Res.second = FindFirstInst();
11043 return *Res.second;
11050 if (BlocksSchedules.count(BB)) {
11051 Value *
V = E->isOneOf(E->Scalars.back());
11054 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11055 if (Bundle && Bundle->isPartOfBundle())
11056 for (; Bundle; Bundle = Bundle->NextInBundle)
11057 if (Bundle->OpValue == Bundle->Inst)
11058 Res.second = Bundle->Inst;
11080 Res.second = FindLastInst();
11081 assert(Res.second &&
"Failed to find last instruction in bundle");
11082 return *Res.second;
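// getLastInstructionInBundle caches, per tree entry, the instruction after
// which the vectorized code for that bundle may be emitted: either the
// schedule-derived last bundle member or, for unscheduled and vector-like
// bundles, the dominance-wise first or last scalar instruction.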
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is a PHI, set the insert point after all PHI nodes.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (IsPHI || (E->State != TreeEntry::NeedToGather &&
                doesNotNeedToSchedule(E->Scalars))) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle.
    Builder.SetInsertPoint(
        LastInst->getParent(),
        LastInst->getNextNonDebugInstruction()->getIterator());
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
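// setInsertPointAfterBundle positions the IR builder right after the bundle's
// last instruction (or after the PHI block for PHI bundles) and propagates the
// main operation's debug location to the vector code about to be emitted.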
Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
  // Instructions from the current block and/or the enclosing loop are inserted
  // last, to give hoisting of loop-invariant code a better chance.
  SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           getTreeEntry(Inst) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(VL[I], I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (cast<VectorType>(Vec->getType())->getElementType() != Ty) {
      assert(V->getType()->isIntegerTy() && Ty->isIntegerTy() &&
             "Expected integer types only.");
      Vec = Builder.CreateIntCast(
          Vec,
          VectorType::get(Ty, cast<VectorType>(Vec->getType())
                                  ->getElementCount()) /* ... */);
    }

    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
    auto *InsElt = dyn_cast<InsertElementInst>(Vec);
    if (!InsElt)
      return Vec;
    GatherShuffleExtractSeq.insert(InsElt);
    CSEBlocks.insert(InsElt->getParent());
    // Add to the 'need-to-extract' list.
    if (isa<Instruction>(V)) {
      if (TreeEntry *Entry = getTreeEntry(V)) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (auto *SI = dyn_cast<Instruction>(Scalar))
          UserOp = SI;
        // ...
        unsigned FoundLane = Entry->findLaneForValue(V);
        ExternalUses.emplace_back(V, UserOp, FoundLane);
      }
    }
    return Vec;
  };
  Value *Val0 =
      isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
  Type *ScalarTy = Val0->getType();
  // ... create the initial vector value Vec (Root if provided, poison
  // otherwise) ...
  SmallVector<int> NonConsts;
  // Insert constant values first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(I))
      continue;
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    if (Root) {
      if (!isa<UndefValue>(VL[I])) {
        NonConsts.push_back(I);
        continue;
      }
      if (isa<PoisonValue>(VL[I]))
        continue;
      if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
        if (SV->getMaskValue(I) == PoisonMaskElem)
          continue;
      }
    }
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions that are (or may be) part of the loop at the end, so
  // that loop-invariant code can still be hoisted.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
  return Vec;
}
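// gather() emits an insertelement chain for VL: constants first, then
// non-constant scalars, postponing values defined in the current block or the
// enclosing loop to the end of the chain so later hoisting/CSE has a better
// chance. Each inserted in-tree scalar is also recorded as an external use so
// that it can be extracted from the vectorized tree.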
  bool IsFinalized = false;

  /// A helper class to build shuffles while tracking the created instructions
  /// for later CSE.
  class ShuffleIRBuilder {
    IRBuilderBase &Builder;
    SetVector<Instruction *> &GatherShuffleExtractSeq;
    SetVector<BasicBlock *> &CSEBlocks;
    const DataLayout &DL;

  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     SetVector<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates a shufflevector of the two given vector values.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (V1->getType() != V2->getType()) {
          if (cast<VectorType>(V2->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth()) {
            // ... widen V2 to V1's type ...
          } else {
            // ... widen V1 to V2's type ...
          }
        }
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates a single-operand shuffle, permuting V1 with Mask.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      unsigned VF = Mask.size();
      unsigned LocalVF =
          cast<FixedVectorType>(V1->getType())->getNumElements();
      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
        return V1;
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(FixedVectorType::get(Ty, VF));
    }
    /// Resizes the two given vectors to the larger of the two factors, so that
    /// they can be shuffled together.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      // ...
    }
  };

  /// Smart shuffle instruction emission: walks through the shuffle trees and
  /// tries to find the best matching vector for the actual shuffle.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder);
  }

  /// Transforms the combined mask after a shuffle has been emitted, so that
  /// CommonMask only refers to lanes of the new value.
  // for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) ...

public:
  ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
      : Builder(Builder), R(R) {}
  /// Adjusts extractelements after reusing them instead of gathering.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      if (Mask[I] == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
      VecBase = EI->getVectorOperand();
      if (const TreeEntry *TE = R.getTreeEntry(VecBase))
        VecBase = TE->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If the only use is vectorized, the extractelement itself can be
      // erased.
      if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            const TreeEntry *UTE = R.getTreeEntry(U);
            return !UTE || R.MultiNodeScalars.contains(U) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return any_of(TE->UserTreeIndices,
                                            [&](const EdgeInfo &Edge) {
                                              return Edge.UserTE == UTE;
                                            }) &&
                                     is_contained(TE->Scalars, EI);
                            }) != 1;
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1)
      return VecBase;
    UseVecBaseAsInput = true;
    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
      for (auto [I, Idx] : enumerate(Mask))
        if (Idx != PoisonMaskElem)
          Idx = I;
    };
    // Perform a multi-register shuffle, joining the bases into a single
    // virtual long vector.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = E->Scalars.size() / NumParts;
    for (unsigned Part = 0; Part < NumParts; ++Part) {
      ArrayRef<Value *> VL =
          ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
      MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      // ...
      for (const auto [I, V] : enumerate(VL)) {
        if (SubMask[I] == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (const TreeEntry *TE = R.getTreeEntry(VecOp))
          VecOp = TE->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        const int Size =
            cast<FixedVectorType>(VecOp->getType())->getNumElements();
        assert((PrevSize == Size || PrevSize == 0) &&
               "Expected vectors of the same size.");
        Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert((Part == 0 ||
                all_of(seq<unsigned>(0, Part),
                       [&](unsigned P) {
                         ArrayRef<int> SubMask =
                             Mask.slice(P * SliceSize, SliceSize);
                         return all_of(SubMask, [](int Idx) {
                           return Idx == PoisonMaskElem;
                         });
                       })) &&
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements();
        // ...
        unsigned SubVecVF =
            cast<FixedVectorType>(SubVec->getType())->getNumElements();
        VF = std::max(VF, SubVecVF);
        // Adjust the submask so it addresses the second shuffle operand.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += VF;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    return Vec;
  }
  /// Checks if the specified entry's vectors are not yet built and emission
  /// must be delayed until all dependencies are ready.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all dependencies are already vectorized.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission; it will be emitted after the end of the
    // process, using a placeholder value of E->getVectorFactor() width.
    // ...
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    add(E1.VectorizedValue, E2.VectorizedValue, Mask);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    add(E1.VectorizedValue, Mask);
  }
  /// Adds two input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + Sz;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds another input vector and the mask for its shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool = false) {
    if (InVectors.empty()) {
      if (!isa<FixedVectorType>(V1->getType())) {
        V1 = createShuffle(V1, nullptr, CommonMask);
        CommonMask.assign(Mask.size(), PoisonMaskElem);
        transformMaskAfterShuffle(CommonMask, Mask);
      }
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType() ||
          !isa<FixedVectorType>(V1->getType())) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem &&
              Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] =
                V->getType() != V1->getType()
                    ? Idx + Sz
                    : Mask[Idx] + cast<FixedVectorType>(V1->getType())
                                      ->getNumElements();
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // ... the second vector is not required if the used elements are
      // already covered by the first one ...
    }
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) {
      // ...
    }
    int VF = CommonMask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  /// Emits a gather sequence for the given scalars.
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root);
  }
  /// Finalizes the accumulated shuffles and returns the resulting vector.
  Value *
  finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask);
      InVectors.front() = Vec;
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                 bool PostponedPHIs) {
  ValueList &VL = E->getOperand(NodeIdx);
  const unsigned VF = VL.size();
  InstructionsState S = getSameOpcode(VL, *TLI);
  // Special processing for GEP bundles, which may contain non-GEP values.
  if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    if (It != VL.end())
      S = getSameOpcode(*It, *TLI);
  }
  if (S.getOpcode()) {
    auto CheckSameVE = [&](const TreeEntry *VE) {
      return VE->isSame(VL) &&
             (any_of(VE->UserTreeIndices,
                     [E, NodeIdx](const EdgeInfo &EI) {
                       return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
                     }) ||
              any_of(VectorizableTree,
                     [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
                       return TE->isOperandGatherNode({E, NodeIdx}) &&
                              VE->isSame(TE->Scalars);
                     }));
    };
    TreeEntry *VE = getTreeEntry(S.OpValue);
    bool IsSameVE = VE && CheckSameVE(VE);
    if (!IsSameVE) {
      auto It = MultiNodeScalars.find(S.OpValue);
      if (It != MultiNodeScalars.end()) {
        auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
          return TE != VE && CheckSameVE(TE);
        });
        if (I != It->getSecond().end()) {
          VE = *I;
          IsSameVE = true;
        }
      }
    }
    if (IsSameVE) {
      auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
        ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
        ShuffleBuilder.add(V, Mask);
        return ShuffleBuilder.finalize(std::nullopt);
      };
      Value *V = vectorizeTree(VE, PostponedPHIs);
      if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
        if (!VE->ReuseShuffleIndices.empty()) {
          // Reshuffle to get only the unique values.
          SmallVector<int> Mask(VF, PoisonMaskElem);
          for (auto [I, V] : enumerate(VL)) {
            if (isa<PoisonValue>(V))
              continue;
            Mask[I] = VE->findLaneForValue(V);
          }
          V = FinalShuffle(V, Mask);
        } else {
          assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
                 "Expected vectorization factor less "
                 "than original vector size.");
          SmallVector<int> UniformMask(VF, 0);
          std::iota(UniformMask.begin(), UniformMask.end(), 0);
          V = FinalShuffle(V, UniformMask);
        }
      }
      // Update the operand gather node if the operand itself is not
      // vectorized and another entry is reused instead.
      if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
            return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
          }) == VE->UserTreeIndices.end()) {
        auto *It = find_if(
            VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
              return TE->State == TreeEntry::NeedToGather &&
                     TE->UserTreeIndices.front().UserTE == E &&
                     TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
            });
        assert(It != VectorizableTree.end() && "Expected gather node operand.");
        (*It)->VectorizedValue = V;
      }
      return V;
    }
  }

  // Find the corresponding gather entry and vectorize it.
  auto *I = find_if(VectorizableTree,
                    [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
                      return TE->isOperandGatherNode({E, NodeIdx});
                    });
  assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
  assert(I->get()->UserTreeIndices.size() == 1 &&
         "Expected only single user for the gather node.");
  assert(I->get()->isSame(VL) && "Expected same list of scalars.");
  return vectorizeTree(I->get(), PostponedPHIs);
}
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
  assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
                                       E->ReuseShuffleIndices.end());
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Build a mask out of the reorder indices and reorder scalars per this mask.
  SmallVector<int> ReorderMask;
  inversePermutation(E->ReorderIndices, ReorderMask);
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
    unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    auto *It =
        find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
          return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
                   return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
                 }) != TE->UserTreeIndices.end();
        });
    if (It == VectorizableTree.end())
      return false;
    // ...
    if ((Mask.size() < InputVF &&
         /* ... extract-subvector mask starting at 0 ... */ true) ||
        (Mask.size() == InputVF &&
         ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
      std::iota(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), (I + 1) * SliceSize), 0);
    } else {
      unsigned IVal =
          *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
      std::fill(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
    }
    return true;
  };
  BVTy ShuffleBuilder(Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *ScalarTy = GatheredScalars.front()->getType();
  unsigned NumParts = TTI->getNumberOfParts(
      FixedVectorType::get(ScalarTy, GatheredScalars.size()));
  if (NumParts == 0 || NumParts >= GatheredScalars.size())
    NumParts = 1;
  if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
    // Check for gathered extracts.
    bool Resized = false;
    ExtractShuffles =
        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(ExtractMask)) {
        if (I == PoisonMaskElem)
          continue;
        if (const auto *TE = getTreeEntry(
                cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
          ExtractEntries.push_back(TE);
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            Resized = true;
            GatheredScalars.append(VF - GatheredScalars.size(),
                                   PoisonValue::get(ScalarTy));
          }
      }
    }
    // Check full matched gathers only after extracts.
    if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
        E->isAltShuffle() ||
        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
        isSplat(E->Scalars) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffles =
          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
    }
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
          Entries.front().front()->isSame(E->Scalars)) {
        // Perfect match in the graph, reuse the previously vectorized node.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          << shortBundleName(E->Scalars) << ".\n");
        // Restore the mask for previously partially matched values.
        Mask.resize(E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
        } else {
          for (auto [I, V] : enumerate(E->Scalars)) {
            if (isa<PoisonValue>(V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        ShuffleBuilder.add(*FrontTE, Mask);
        Res = ShuffleBuilder.finalize(E->getCommonMask());
        return Res;
      }
      if (!Resized &&
          GatheredScalars.size() != VF &&
          any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
            return any_of(TEs, [&](const TreeEntry *TE) {
              return TE->getVectorFactor() == VF;
            });
          }))
        GatheredScalars.append(VF - GatheredScalars.size(),
                               PoisonValue::get(ScalarTy));
      // Remove shuffled elements from the list of gathers.
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        if (Mask[I] != PoisonMaskElem)
          GatheredScalars[I] = PoisonValue::get(ScalarTy);
      }
    }
  }
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such scalars.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
    // Gather unique non-constant values and all constant values; for repeated
    // values, just shuffle them.
    int NumNonConsts = 0;
    for (auto [I, V] : enumerate(Scalars)) {
      if (isa<UndefValue>(V)) {
        if (!isa<PoisonValue>(V)) {
          // ... remember the undef position ...
        }
        continue;
      }
      // ...
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore the single insertelement.
      // ...
      if (!UndefPos.empty() && UndefPos.front() == 0)
        Scalars.front() = UndefValue::get(ScalarTy);
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to emit a simple broadcast: replace the undefs
      // with a value that is guaranteed not to be poisonous, or freeze it.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
                (E->UserTreeIndices.size() == 1 &&
                 any_of(V->uses(), [E](const Use &U) {
                   // Check if the value is already used in the same operation
                   // in one of the nodes.
                   return E->UserTreeIndices.front().EdgeIdx !=
                              U.getOperandNo() &&
                          is_contained(
                              E->UserTreeIndices.front().UserTE->Scalars,
                              U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalar and emit a broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          ReuseMask[I] = Pos;
          // ...
        }
      } else {
        // Replace undefs by poison, emit a broadcast and then a freeze.
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          if (isa<UndefValue>(Scalars[I]))
            Scalars[I] = PoisonValue::get(ScalarTy);
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // Gather of extractelements can be represented as just a shuffle of a
      // single/two vectors of the scalars' source vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        // ...
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          if (isa<UndefValue>(E->Scalars[I]))
            continue;
          auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
          Value *VecOp = EI->getVectorOperand();
          if (const auto *TE = getTreeEntry(VecOp))
            if (TE->VectorizedValue)
              VecOp = TE->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        IsNonPoisoned &=
            isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size());
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
      } else {
        IsUsedInExpr = false;
        ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
                               ScalarTy, GatheredScalars.size())),
                           ExtractMask, /*ForExtracts=*/true);
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize = E->Scalars.size() / NumParts;
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
        VecMask.assign(VecMask.size(), PoisonMaskElem);
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          IsUsedInExpr &= FindReusedSplat(
              VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          if (TEs.front()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
                isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
        }
      }
    }
    // Try to figure out the best way to combine the shuffles and build a
    // shorter buildvector sequence.
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build a constant vector and shuffle it with the remaining
    // gathered scalars instead of a full gather.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(V) && !isa<PoisonValue>(V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(V) && !isa<UndefValue>(V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(V) && !isa<PoisonValue>(V);
         }) > 1);
    // Non-const scalars are replaced by poison and handled separately.
    SmallVector<Value *> NonConstants(GatheredScalars);
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(ScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(ScalarTy);
    }
    // Generate constants for the final shuffle and build a mask for them.
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
            TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
            Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
          });
  } else if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(E->Scalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(E->Scalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
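// processBuildVector is the generic driver behind both gather cost estimation
// and gather code emission (the BVTy template parameter selects a cost-model
// or IR-emitting shuffle builder): it reuses extractelement sources and
// already-vectorized entries where possible and only builds the remaining
// scalars with an explicit gather, freezing the result when a broadcast of a
// possibly-poisonous value had to be used.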
Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Builder,
                                                                *this);
}
Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
  IRBuilderBase::InsertPointGuard Guard(Builder);

  if (E->VectorizedValue &&
      (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
       E->isAltShuffle())) {
    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }

  if (E->State == TreeEntry::NeedToGather) {
    // Set the insert point for non-reduction gather nodes.
    if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E);
    E->VectorizedValue = Vec;
    return Vec;
  }

  bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
    ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
    if (E->getOpcode() == Instruction::Store) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
      ShuffleBuilder.addOrdered(V, std::nullopt);
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
  };

  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize) &&
         "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  Type *ScalarTy = VL0->getType();
  if (auto *Store = dyn_cast<StoreInst>(VL0))
    ScalarTy = Store->getValueOperand()->getType();
  else if (auto *IE = dyn_cast<InsertElementInst>(VL0))
    ScalarTy = IE->getOperand(1)->getType();
  auto It = MinBWs.find(E);
  if (It != MinBWs.end())
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
  auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() ||
            !E->UserTreeIndices.empty()) &&
           "PHI reordering is free.");
    if (PostponedPHIs && E->VectorizedValue)
      return E->VectorizedValue;
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    // ...
    if (PostponedPHIs || !E->VectorizedValue) {
      // ... create the new vector PHI, then move the insert point past the
      // PHI nodes before emitting the final shuffle ...
      Builder.SetInsertPoint(PH->getParent(),
                             PH->getParent()->getFirstInsertionPt());
      V = FinalShuffle(V, E, VecTy);
      E->VectorizedValue = V;
      if (PostponedPHIs)
        return V;
    }
    PHINode *NewPhi = cast<PHINode>(E->PHI);
    // PHI nodes may have multiple entries from the same block; visit every
    // incoming block only once.
    for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
      // Stop emission if all incoming values are generated.
      if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return V;
      }
      if (!VisitedBBs.insert(IBB).second) {
        NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
        continue;
      }
      // ...
      Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, I)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      NewPhi->addIncoming(Vec, IBB);
    }
    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
           "Invalid number of incoming values");
    return V;
  }
12345 Value *
V = E->getSingleOperand(0);
12346 if (
const TreeEntry *TE = getTreeEntry(V))
12347 V =
TE->VectorizedValue;
12348 setInsertPointAfterBundle(E);
12349 V = FinalShuffle(V, E, VecTy);
12350 E->VectorizedValue =
V;
12353 case Instruction::ExtractValue: {
12354 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
12359 NewV = FinalShuffle(NewV, E, VecTy);
12360 E->VectorizedValue = NewV;
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
    Value *V = vectorizeOperand(E, 1, PostponedPHIs);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
      assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
          V,
          FixedVectorType::get(
              ScalarTy,
              cast<FixedVectorType>(V->getType())->getNumElements()),
          Res.second);
    }

    // Create the InsertVector shuffle if necessary.
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();

    unsigned Offset = *getInsertIndex(VL0);
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create a shuffle to collect the vectorized operand scalars.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(NumElts - NumScalars, PoisonMaskElem);
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    // Create the InsertVector shuffle if necessary.
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      Value *Scalar = E->Scalars[PrevMask[I]];
      unsigned InsertIdx = *getInsertIndex(Scalar);
      IsIdentity &= InsertIdx - Offset == I;
      Mask[InsertIdx - Offset] = I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // Follow all insertelement instructions from the current buildvector
        // sequence.
        InsertElementInst *Ins = cast<InsertElementInst>(VL0);
        do {
          std::optional<unsigned> InsertIdx = getInsertIndex(Ins);
          if (!InsertIdx)
            break;
          if (InsertMask[*InsertIdx] == PoisonMaskElem)
            InsertMask[*InsertIdx] = *InsertIdx;
          if (!Ins->hasOneUse())
            break;
          Ins = dyn_cast_or_null<InsertElementInst>(
              Ins->getUniqueUndroppableUser());
        } while (Ins);
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        SmallBitVector IsFirstUndef =
            isUndefVector(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          unsigned Idx = 0;
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              if (!V2)
                V2 = UndefValue::get(V->getType());
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              ++Idx;
            }
            // ...
          }
        }
        // ...
        V = Builder.CreateShuffleVector(V, V2, InsertMask);
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      SmallVector<int> InsertMask2(NumElts, PoisonMaskElem);
      for (unsigned I = 0; I < NumElts; I++) {
        if (Mask[I] != PoisonMaskElem)
          InsertMask2[Offset + I] = I;
      }
      SmallBitVector UseMask =
          buildUseMask(NumElts, InsertMask2, UseMask::UndefsAsMask);
      SmallBitVector IsFirstUndef =
          isUndefVector(FirstInsert->getOperand(0), UseMask);
      if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
          NumElts != NumScalars) {
        if (IsFirstUndef.all()) {
          if (!ShuffleVectorInst::isIdentityMask(InsertMask2, NumElts)) {
            SmallBitVector IsFirstPoison =
                isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
            if (!IsFirstPoison.all()) {
              for (unsigned I = 0; I < NumElts; I++) {
                if (InsertMask2[I] == PoisonMaskElem && !IsFirstPoison.test(I))
                  InsertMask2[I] = I + NumElts;
              }
            }
            V = Builder.CreateShuffleVector(
                V,
                IsFirstPoison.all() ? PoisonValue::get(V->getType())
                                    : FirstInsert->getOperand(0),
                InsertMask2, cast<Instruction>(E->Scalars.back())->getName());
            if (auto *I = dyn_cast<Instruction>(V)) {
              GatherShuffleExtractSeq.insert(I);
              CSEBlocks.insert(I->getParent());
            }
          }
        } else {
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask2[I] == PoisonMaskElem)
              InsertMask2[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
            else
              InsertMask2[I] += NumElts;
          }
          V = Builder.CreateShuffleVector(
              FirstInsert->getOperand(0), V, InsertMask2,
              cast<Instruction>(E->Scalars.back())->getName());
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      }
    }

    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    auto *CI = cast<CastInst>(VL0);
    Instruction::CastOps VecOpcode = CI->getOpcode();
    Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         SrcScalarTy != CI->getOperand(0)->getType())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *R = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (L->getType() != R->getType()) {
      assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
              getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }

    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    Value *V = Builder.CreateCmp(P0, L, R);
    propagateIRFlags(V, E->Scalars, VL0);
    // Do not cast for cmps.
    VecTy = cast<FixedVectorType>(V->getType());
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *True = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *False = vectorizeOperand(E, 2, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((It != MinBWs.end() ||
              getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
              getOperandEntry(E, 2)->State == TreeEntry::NeedToGather ||
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }

    Value *V = Builder.CreateSelect(Cond, True, False);
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);

    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    Value *V = Builder.CreateUnOp(
        static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *I = dyn_cast<Instruction>(V))
      V = propagateMetadata(I, E->Scalars);

    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
      assert((It != MinBWs.end() ||
              getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
              getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (LHS->getType() != VecTy)
        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      if (RHS->getType() != VecTy)
        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }

    Value *V = Builder.CreateBinOp(
        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
    propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = propagateMetadata(I, E->Scalars);
      // Drop nuw flags for abs(sub(commutative), true).
      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          any_of(E->Scalars, [](Value *V) {
            return isCommutative(cast<Instruction>(V));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    }

    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::StridedVectorize) {
      Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
      Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
      PO = IsReverseOrder ? PtrN : Ptr0;
      std::optional<int> Diff = getPointersDiff(
          VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
      Type *StrideTy = DL->getIndexType(PO->getType());
      Value *StrideVal;
      if (Diff) {
        int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
        StrideVal =
            ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                           DL->getTypeAllocSize(ScalarTy));
      } else {
        SmallVector<Value *> PointerOps(E->Scalars.size());
        transform(E->Scalars, PointerOps.begin(), [](Value *V) {
          return cast<LoadInst>(V)->getPointerOperand();
        });
        std::optional<Value *> Stride =
            calculateRtStride(PointerOps, ScalarTy, *DL, *SE, /*...*/);
        // ...
        StrideVal = Builder.CreateMul(
            /* stride */ *Stride,
            ConstantInt::get(StrideTy,
                             (IsReverseOrder ? -1 : 1) *
                                 static_cast<int>(
                                     DL->getTypeAllocSize(ScalarTy))));
      }
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {VecTy, PO->getType(), StrideTy},
          {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      // Use the minimum alignment of the gathered loads.
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = propagateMetadata(NewLI, E->Scalars);

    V = FinalShuffle(V, E, VecTy);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);

    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E, VecTy);

    Value *Ptr = SI->getPointerOperand();
    Instruction *ST =
        Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    Value *V = propagateMetadata(ST, E->Scalars);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    auto VecCallCosts = getVectorCallCosts(
        CI, VecTy, TTI, TLI, ArgTys,
        It != MinBWs.end() ? It->second.first : 0);
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    SmallVector<Type *, 2> TysForDecl;
    auto *CEI = cast<CallInst>(VL0);
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      // Some intrinsics have scalar arguments; these need to stay scalar.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
        ScalarArg = CEI->getArgOperand(I);
        // If this argument is overwritten by a minimum bit width, the abs
        // intrinsic may need its "is_int_min_poison" flag adjusted.
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        OpVecs.push_back(ScalarArg);
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      ScalarArg = CEI->getArgOperand(I);
      if (cast<VectorType>(OpVec->getType())->getElementType() !=
              ScalarArg->getType() &&
          It == MinBWs.end()) {
        auto *CastTy = FixedVectorType::get(ScalarArg->getType(),
                                            VecTy->getNumElements());
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      OpVecs.push_back(OpVec);
    }

    Function *CF;
    if (!UseIntrinsic) {
      // ... build a vector library call via VFABI mangling ...
    } else {
      // ... get the vector intrinsic declaration for TysForDecl ...
    }
    // ...
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
    propagateIRFlags(V, E->Scalars, VL0);
    V = FinalShuffle(V, E, VecTy);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    assert(E->isAltShuffle() &&
           ((Instruction::isBinaryOp(E->getOpcode()) &&
             Instruction::isBinaryOp(E->getAltOpcode())) ||
            (Instruction::isCast(E->getOpcode()) &&
             Instruction::isCast(E->getAltOpcode())) ||
            (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
           "Invalid Shuffle Vector Operand");

    Value *LHS = nullptr, *RHS = nullptr;
    if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
      setInsertPointAfterBundle(E);
      LHS = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      RHS = vectorizeOperand(E, 1, PostponedPHIs);
    } else {
      setInsertPointAfterBundle(E);
      LHS = vectorizeOperand(E, 0, PostponedPHIs);
    }
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (LHS && RHS &&
        ((Instruction::isBinaryOp(E->getOpcode()) &&
          (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
         (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
      assert((It != MinBWs.end() ||
              getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
              getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      Type *CastTy = VecTy;
      if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
        if (cast<VectorType>(LHS->getType())
                ->getElementType()
                ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                             ->getElementType()
                                             ->getIntegerBitWidth())
          CastTy = RHS->getType();
        else
          CastTy = LHS->getType();
      }
      if (LHS->getType() != CastTy)
        LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
      if (RHS->getType() != CastTy)
        RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
    }

    Value *V0, *V1;
    if (Instruction::isBinaryOp(E->getOpcode())) {
      V0 = Builder.CreateBinOp(
          static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
      V1 = Builder.CreateBinOp(
          static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
    } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
      V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
      auto *AltCI = cast<CmpInst>(E->getAltOp());
      CmpInst::Predicate AltPred = AltCI->getPredicate();
      V1 = Builder.CreateCmp(AltPred, LHS, RHS);
    } else {
      if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
        unsigned SrcBWSz = DL->getTypeSizeInBits(
            cast<VectorType>(LHS->getType())->getElementType());
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        if (BWSz <= SrcBWSz) {
          if (BWSz < SrcBWSz)
            LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
          assert(LHS->getType() == VecTy && "Expected same type as operand.");
          if (auto *I = dyn_cast<Instruction>(LHS))
            LHS = propagateMetadata(I, E->Scalars);
          E->VectorizedValue = LHS;
          ++NumVectorInstructions;
          return LHS;
        }
      }
      V0 = Builder.CreateCast(
          static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
      V1 = Builder.CreateCast(
          static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
    }
    // Add V0 and V1 to later analysis, to try to find and merge matching
    // vector patterns.
    for (Value *V : {V0, V1}) {
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    // Create the shuffle mask to separate main and alternate instructions.
    SmallVector<int> Mask;
    SmallVector<Value *> OpScalars, AltScalars;
    E->buildAltOpShuffleMask(
        [E, this](Instruction *I) {
          assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
          return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                        *TLI);
        },
        Mask, &OpScalars, &AltScalars);

    propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
    propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
    auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
      // Drop nuw flags for abs(sub(commutative), true).
      if (auto *I = dyn_cast<Instruction>(Vec);
          I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
          any_of(E->Scalars, [](Value *V) {
            auto *IV = cast<Instruction>(V);
            return IV->getOpcode() == Instruction::Sub &&
                   isCommutative(cast<Instruction>(IV));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    };
    DropNuwFlag(V0, E->getOpcode());
    DropNuwFlag(V1, E->getAltOpcode());

    Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = propagateMetadata(I, E->Scalars);
      GatherShuffleExtractSeq.insert(I);
      CSEBlocks.insert(I->getParent());
    }

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  default:
    llvm_unreachable("unknown inst");
  }
  return nullptr;
}
Value *BoUpSLP::vectorizeTree() {
  ExtraValueToDebugLocsMap ExternallyUsedValues;
  SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
  return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
}

namespace {
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  MapVector<Value *, SmallVector<int>> ValueMasks;
};
} // namespace

Value *BoUpSLP::vectorizeTree(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues,
    SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
    Instruction *ReductionRoot) {
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }
  // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
  // because of the new instructions that may cause new scheduling regions.
  EntryToLastInstruction.clear();

  if (ReductionRoot)
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
  else
    Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());

  // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
    if (TE->State == TreeEntry::Vectorize &&
        TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
        TE->VectorizedValue)
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
  // Run through the list of postponed gathers and emit them, replacing the
  // temporary placeholder instructions with the final vector code.
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
      if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
              TE->UserTreeIndices.front().EdgeIdx)) &&
          VecTE->isSame(TE->Scalars))
        // Found gather node which is absolutely the same as one of the
        // vectorized nodes. It may happen after reordering.
        continue;
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI =
        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, the operand must be emitted in the incoming
    // block; otherwise insert before the user, unless a later user in the same
    // block requires an earlier point.
    Instruction *InsertPt = PrevVec;
    if (isa<PHINode>(UserI)) {
      // ...
    } else {
      for (User *U : PrevVec->users()) {
        // ...
        auto *UI = dyn_cast<Instruction>(U);
        if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
          continue;
        if (UI->comesBefore(InsertPt))
          InsertPt = UI;
      }
    }
    Builder.SetInsertPoint(InsertPt);
    Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        if (const TreeEntry *BaseTE = getTreeEntry(V)) {
          auto It = MinBWs.find(BaseTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            if (*IsSigned)
              break;
          }
          for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
            auto It = MinBWs.find(MNTE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
            auto It = MinBWs.find(BVE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
            IsSigned =
                IsSigned.value_or(false) ||
                !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
            continue;
          }
          if (IsSigned.value_or(false))
            break;
        }
      }
      if (IsSigned.value_or(false)) {
        // Final attempt - check the user node of the gather.
        auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(IsSigned &&
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node, if it was used before for one of the
    // buildvector nodes already.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    eraseInstruction(PrevVec);
  }
  // Extract all of the elements with the external uses.
  LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
                    << " values .\n");
  SmallVector<ShuffledInsertData> ShuffledInserts;
  // Maps vector instruction to original insertelement instruction.
  DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
  // Maps extract Scalar to the corresponding extractelement instruction in the
  // basic block. Only one extractelement per block should be emitted.
  DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Instruction *, Value *>>>
      ScalarToEEs;
  SmallDenseSet<Value *, 4> UsedInserts;
  DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
  SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
  // Extract all of the elements with the external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    llvm::User *User = ExternalUse.User;

    // Skip users that we already RAUW. This happens when one instruction
    // has multiple uses of the same value.
    if (User && !is_contained(Scalar->users(), User))
      continue;
    TreeEntry *E = getTreeEntry(Scalar);
    assert(E && "Invalid scalar");
    assert(E->State != TreeEntry::NeedToGather &&
           "Extracting from a gather list");
    // Non-instruction pointers are not deleted, just skip them.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
        bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts, just move the only one in the
          // current block.
          auto EEIt = It->second.find(Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Instruction *I = EEIt->second.first;
            if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
              if (auto *CI = EEIt->second.second)
                CI->moveAfter(I);
            }
            Ex = I;
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // "Reuse" the existing extract to improve final codegen.
          if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
            Value *V = ES->getVectorOperand();
            if (const TreeEntry *ETE = getTreeEntry(V))
              V = ETE->VectorizedValue;
            Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
          } else if (ReplaceGEP) {
            // Leave the GEPs as-is, they are free in most cases.
            auto *CloneGEP = GEP->clone();
            CloneGEP->insertBefore(*Builder.GetInsertBlock(),
                                   Builder.GetInsertPoint());
            if (GEP->hasName())
              CloneGEP->takeName(GEP);
            Ex = CloneGEP;
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          // If necessary, sign-extend or zero-extend ScalarRoot to the larger
          // type.
          ExV = Ex;
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
                                        MinBWs.find(E)->second.second);
          if (auto *I = dyn_cast<Instruction>(Ex))
            ScalarToEEs[Scalar].try_emplace(
                Builder.GetInsertBlock(),
                std::make_pair(I, cast<Instruction>(ExV)));
        }
        // The then branch of the previous if may produce constants, since 0
        // operand might be a constant.
        if (auto *ExI = dyn_cast<Instruction>(Ex)) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        return ExV;
      }
      assert(isa<FixedVectorType>(Scalar->getType()) &&
             isa<InsertElementInst>(Scalar) &&
             "In-tree scalar of vector type is not insertelement?");
      auto *IE = cast<InsertElementInst>(Scalar);
      VectorToInsertElement.try_emplace(Vec, IE);
      return Vec;
    };
    // If User == nullptr, the Scalar remains as scalar in vectorized
    // instructions or is used as an extra argument. Generate ExtractElement
    // instruction and update the record for this scalar in
    // ExternallyUsedValues.
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert((ExternallyUsedValues.count(Scalar) ||
              Scalar->hasNUsesOrMore(UsesLimit) ||
              ExternalUsesAsGEPs.contains(Scalar) ||
              any_of(Scalar->users(),
                     [&](llvm::User *U) {
                       if (ExternalUsesAsGEPs.contains(U))
                         return true;
                       TreeEntry *UseEntry = getTreeEntry(U);
                       return UseEntry &&
                              (UseEntry->State == TreeEntry::Vectorize ||
                               UseEntry->State ==
                                   TreeEntry::StridedVectorize) &&
                              (E->State == TreeEntry::Vectorize ||
                               E->State == TreeEntry::StridedVectorize) &&
                              doesInTreeUserNeedToExtract(
                                  Scalar,
                                  cast<Instruction>(UseEntry->Scalars.front()),
                                  TLI);
                     })) &&
             "Scalar with nullptr User must be registered in "
             "ExternallyUsedValues map or remain as scalar in vectorized "
             "instructions");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI))
          Builder.SetInsertPoint(PHI->getParent(),
                                 PHI->getParent()->getFirstNonPHIIt());
        else
          Builder.SetInsertPoint(VecI->getParent(),
                                 std::next(VecI->getIterator()));
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // Required to update internally referenced instructions.
      Scalar->replaceAllUsesWith(NewInst);
      ReplacedExternals.emplace_back(Scalar, NewInst);
      continue;
    }

    if (auto *VU = dyn_cast<InsertElementInst>(User)) {
      // Skip if the scalar is another vector op or Vec is not an instruction.
      if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
        if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
          if (!UsedInserts.insert(VU).second)
            continue;
          // Need to use original vector, if the root is truncated.
          auto BWIt = MinBWs.find(E);
          if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
            auto *ScalarTy = FTy->getElementType();
            auto Key = std::make_pair(Vec, ScalarTy);
            auto VecIt = VectorCasts.find(Key);
            if (VecIt == VectorCasts.end()) {
              // ...
              if (auto *IVec = dyn_cast<Instruction>(Vec))
                Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
              Vec = Builder.CreateIntCast(
                  Vec,
                  FixedVectorType::get(
                      ScalarTy,
                      cast<FixedVectorType>(Vec->getType())->getNumElements()),
                  BWIt->second.second);
              VectorCasts.try_emplace(Key, Vec);
            } else {
              Vec = VecIt->second;
            }
          }

          std::optional<unsigned> InsertIdx = getInsertIndex(VU);
          if (InsertIdx) {
            auto *It =
                find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
                  // Checks if 2 insertelements are from the same buildvector.
                  InsertElementInst *VecInsert = Data.InsertElements.front();
                  return areTwoInsertFromSameBuildVector(
                      VU, VecInsert,
                      [](InsertElementInst *II) { return II->getOperand(0); });
                });
            unsigned Idx = *InsertIdx;
            if (It == ShuffledInserts.end()) {
              (void)ShuffledInserts.emplace_back();
              It = std::next(ShuffledInserts.begin(),
                             ShuffledInserts.size() - 1);
              SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
              if (Mask.empty())
                Mask.assign(FTy->getNumElements(), PoisonMaskElem);
              // Find the insertvector, vectorized in tree, if any.
              Value *Base = VU;
              while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
                if (IEBase != User &&
                    (!IEBase->hasOneUse() ||
                     getInsertIndex(IEBase).value_or(Idx) == Idx))
                  break;
                // Build the mask for the vectorized insertelement instructions.
                if (const TreeEntry *E = getTreeEntry(IEBase)) {
                  do {
                    IEBase = cast<InsertElementInst>(Base);
                    int IEIdx = *getInsertIndex(IEBase);
                    assert(Mask[IEIdx] == PoisonMaskElem &&
                           "InsertElementInstruction used already.");
                    Mask[IEIdx] = IEIdx;
                    Base = IEBase->getOperand(0);
                  } while (E == getTreeEntry(Base));
                  break;
                }
                Base = cast<InsertElementInst>(Base)->getOperand(0);
                // After the vectorization the def-use chain has changed, need
                // to look through original insertelement instructions, if they
                // get replaced by vector instructions.
                auto It = VectorToInsertElement.find(Base);
                if (It != VectorToInsertElement.end())
                  Base = It->second;
              }
            }
            SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
            if (Mask.empty())
              Mask.assign(FTy->getNumElements(), PoisonMaskElem);
            Mask[Idx] = ExternalUse.Lane;
            It->InsertElements.push_back(cast<InsertElementInst>(User));
            continue;
          }
        }
      }
    }

    // Generate extracts for out-of-tree users.
    // Find the insertion point for the extractelement lane.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (PHINode *PH = dyn_cast<PHINode>(User)) {
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
            }
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        Builder.SetInsertPoint(cast<Instruction>(User));
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        User->replaceUsesOfWith(Scalar, NewInst);
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      User->replaceUsesOfWith(Scalar, NewInst);
    }

    LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
  }
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize(std::nullopt);
  };

  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
    if (VF != VecVF) {
      if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
        Vec = CreateShuffle(Vec, nullptr, Mask);
        return std::make_pair(Vec, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        Vec = CreateShuffle(Vec, nullptr, ResizeMask);
      }
    }

    return std::make_pair(Vec, false);
  };
  // Perform shuffling of the vector operands of the buildvector sequences,
  // replacing the whole sequence by a single shuffle where possible.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // Build the final masks and emit the shuffles.
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    Value *NewInst = performExtractsShuffleAction<Value>(
        MutableArrayRef(Vector.data(), Vector.size()),
        FirstInsert->getOperand(0),
        [](Value *Vec) {
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
        },
        ResizeToVF,
        [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
                                      ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create shuffle if the mask is a simple identity
            // non-resizing mask.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : FirstInsert->getOperand(0),
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild the original chain of insertelement instructions on top of the
    // new shuffle.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    SmallVector<Instruction *> Inserts;
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      if (*It == II)
        ++It;
      else
        Inserts.push_back(cast<Instruction>(II));
      II = dyn_cast<InsertElementInst>(II->getOperand(0));
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
      NewInst = II;
    }
    LastInsert->replaceAllUsesWith(NewInst);
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            PoisonValue::get(IE->getOperand(0)->getType()));
      IE->replaceUsesOfWith(IE->getOperand(1),
                            PoisonValue::get(IE->getOperand(1)->getType()));
      eraseInstruction(IE);
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
  SmallVector<Instruction *> RemovedInsts;
  // For each vectorized value:
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->State == TreeEntry::NeedToGather)
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
        continue;
#ifndef NDEBUG
      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
        for (User *U : Scalar->users()) {
          LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");

          // It is legal to delete users in the ignorelist.
          assert((getTreeEntry(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
        }
      }
#endif
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      eraseInstruction(cast<Instruction>(Scalar));
      // Retain to-be-deleted instructions for remaining debug-info bookkeeping.
      RemovedInsts.push_back(cast<Instruction>(Scalar));
    }
  }

  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);

  Builder.ClearInsertionPoint();
  InstrElementSize.clear();

  const TreeEntry &RootTE = *VectorizableTree.front().get();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                        cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
  }
  return Vec;
}
void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or the element that we insert into it are instructions
    // that are loop invariant, we can hoist the insertelement sequence out of
    // the loop.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator());
    CSEBlocks.insert(PreHeader);
  }

  // Make a list of all reachable blocks in our CSE queue.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB)) {
      assert(DT->isReachableFromEntry(N));
      CSEWorkList.push_back(N);
    }

  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Less-defined shuffles can be replaced by the more-defined copies.
  // Between two shuffles one is less defined if it has the same vector
  // operands and the same mask or undefined mask elements where the other one
  // has defined values.
  auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
                                           SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the last undefs actually change the final number of used vector
    // registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(SI1->getType()) ==
               TTI->getNumberOfParts(
                   FixedVectorType::get(SI1->getType()->getElementType(),
                                        SM1.size() - LastUndefsCnt));
  };
  // Perform O(N^2) search over the gather/shuffle sequences and merge identical
  // instructions. TODO: We can further optimize this scan if we split the
  // instructions into different buckets based on the insert lane.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
      if (isDeleted(&In))
        continue;
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the
      // visited instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          eraseInstruction(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          In.moveAfter(V);
          V->replaceAllUsesWith(&In);
          eraseInstruction(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced) {
        assert(!is_contained(Visited, &In));
        Visited.push_back(&In);
      }
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
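// optimizeGatherSequence post-processes the insert/extract/shuffle sequences
// created during vectorization: it hoists loop-invariant insertelement chains
// into loop preheaders and then runs a dominance-ordered CSE pass that merges
// identical (or less-defined) shuffles.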
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
  ScheduleData *Bundle = nullptr;
  ScheduleData *PrevInBundle = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member "
           "(maybe not in same basic block)");
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle) {
      PrevInBundle->NextInBundle = BundleMember;
    } else {
      Bundle = BundleMember;
    }

    // Group the instructions to a bundle.
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  assert(Bundle && "Failed to find schedule bundle");
  return Bundle;
}
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  // No need to schedule PHIs, insertelement, extractelement and extractvalue
  // instructions.
  if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
      doesNotNeedToSchedule(VL))
    return nullptr;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.OpValue << "\n");

  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
                                                           ScheduleData *Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    if (ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
        doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
      ReSchedule = true;
    }
    if (Bundle) {
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
    }

    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleData *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
             "must be ready to schedule");
      schedule(Picked, ReadyInsts);
    }
  };

  // Make sure that the scheduling region contains all instructions of the
  // bundle.
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // Otherwise the compiler may crash trying to incorrectly calculate
      // dependencies and emit instructions in the wrong order at the actual
      // scheduling.
      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");

    // Make sure we don't leave the pieces of the bundle in the ready list when
    // the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);

    if (!BundleMember->IsScheduled)
      continue;
    // A bundle member was scheduled as single instruction before and now needs
    // to be scheduled as part of the bundle. We just get rid of the existing
    // schedule.
    LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  auto *Bundle = buildBundle(VL);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle->isReady()) {
    cancelScheduling(VL, S.OpValue);
    return std::nullopt;
  }
  return Bundle;
}
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
                                                Value *OpValue) {
  if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
      doesNotNeedToSchedule(VL))
    return;

  ScheduleData *Bundle = getScheduleData(OpValue);
  LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() &&
         "tried to unbundle something which is not a bundle");

  // Remove the bundle from the ready list.
  if (Bundle->isReady())
    ReadyInsts.remove(Bundle);

  // Un-bundle: make single instructions out of the bundle.
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->TE = nullptr;
    if (BundleMember->unscheduledDepsInBundle() == 0) {
      ReadyInsts.insert(BundleMember);
    }
    BundleMember = Next;
  }
}
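
// ScheduleData objects are handed out from fixed-size chunks so that pointers
// to them remain stable while the scheduling region grows.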
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
14004bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
Value *V,
14005 const InstructionsState &S) {
14006 if (getScheduleData(V,
isOneOf(S, V)))
14009 assert(
I &&
"bundle member must be an instruction");
14012 "phi nodes/insertelements/extractelements/extractvalues don't need to "
14014 auto &&CheckScheduleForI = [
this, &S](
Instruction *
I) ->
bool {
14015 ScheduleData *ISD = getScheduleData(
I);
14018 assert(isInSchedulingRegion(ISD) &&
14019 "ScheduleData not in scheduling region");
14020 ScheduleData *SD = allocateScheduleDataChunks();
14022 SD->init(SchedulingRegionID, S.OpValue);
14023 ExtraScheduleDataMap[
I][S.OpValue] = SD;
14026 if (CheckScheduleForI(
I))
14028 if (!ScheduleStart) {
14030 initScheduleData(
I,
I->getNextNode(),
nullptr,
nullptr);
14032 ScheduleEnd =
I->getNextNode();
14034 CheckScheduleForI(
I);
14035 assert(ScheduleEnd &&
"tried to vectorize a terminator?");
14036 LLVM_DEBUG(
dbgs() <<
"SLP: initialize schedule region to " << *
I <<
"\n");
14044 ++ScheduleStart->getIterator().getReverse();
14049 if (
auto *II = dyn_cast<IntrinsicInst>(&
I))
14050 return II->isAssumeLikeIntrinsic();
14053 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14054 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14055 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter !=
I &&
14057 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
14058 LLVM_DEBUG(
dbgs() <<
"SLP: exceeded schedule region size limit\n");
14065 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14066 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14068 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter ==
I)) {
14069 assert(
I->getParent() == ScheduleStart->getParent() &&
14070 "Instruction is in wrong basic block.");
14071 initScheduleData(
I, ScheduleStart,
nullptr, FirstLoadStoreInRegion);
14074 CheckScheduleForI(
I);
14079 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter ==
I)) &&
14080 "Expected to reach top of the basic block or instruction down the "
14082 assert(
I->getParent() == ScheduleEnd->getParent() &&
14083 "Instruction is in wrong basic block.");
14084 initScheduleData(ScheduleEnd,
I->getNextNode(), LastLoadStoreInRegion,
14086 ScheduleEnd =
I->getNextNode();
14088 CheckScheduleForI(
I);
14089 assert(ScheduleEnd &&
"tried to vectorize a terminator?");
14090 LLVM_DEBUG(
dbgs() <<
"SLP: extend schedule region end to " << *
I <<
"\n");
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
    if (doesNotNeedToBeScheduled(I))
      continue;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
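
// Compute def-use, control and memory dependencies for the given bundle and,
// transitively, for every bundle it depends on. stacksave/stackrestore and
// allocas get additional control dependencies so that stack manipulation is
// not reordered across the scheduled region.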
14139void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14140 bool InsertInReadyList,
14142 assert(SD->isSchedulingEntity());
14147 while (!WorkList.
empty()) {
14149 for (ScheduleData *BundleMember = SD; BundleMember;
14150 BundleMember = BundleMember->NextInBundle) {
14151 assert(isInSchedulingRegion(BundleMember));
14152 if (BundleMember->hasValidDependencies())
14157 BundleMember->Dependencies = 0;
14158 BundleMember->resetUnscheduledDeps();
14161 if (BundleMember->OpValue != BundleMember->Inst) {
14162 if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
14163 BundleMember->Dependencies++;
14164 ScheduleData *DestBundle = UseSD->FirstInBundle;
14165 if (!DestBundle->IsScheduled)
14166 BundleMember->incrementUnscheduledDeps(1);
14167 if (!DestBundle->hasValidDependencies())
14171 for (
User *U : BundleMember->Inst->
users()) {
14172 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
14173 BundleMember->Dependencies++;
14174 ScheduleData *DestBundle = UseSD->FirstInBundle;
14175 if (!DestBundle->IsScheduled)
14176 BundleMember->incrementUnscheduledDeps(1);
14177 if (!DestBundle->hasValidDependencies())
14184 auto *DepDest = getScheduleData(
I);
14185 assert(DepDest &&
"must be in schedule window");
14186 DepDest->ControlDependencies.push_back(BundleMember);
14187 BundleMember->Dependencies++;
14188 ScheduleData *DestBundle = DepDest->FirstInBundle;
14189 if (!DestBundle->IsScheduled)
14190 BundleMember->incrementUnscheduledDeps(1);
14191 if (!DestBundle->hasValidDependencies())
14199 for (
Instruction *
I = BundleMember->Inst->getNextNode();
14200 I != ScheduleEnd;
I =
I->getNextNode()) {
14205 MakeControlDependent(
I);
14213 if (RegionHasStackSave) {
14217 if (
match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
14218 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
14219 for (
Instruction *
I = BundleMember->Inst->getNextNode();
14220 I != ScheduleEnd;
I =
I->getNextNode()) {
14221 if (
match(
I, m_Intrinsic<Intrinsic::stacksave>()) ||
14222 match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
14227 if (!isa<AllocaInst>(
I))
14231 MakeControlDependent(
I);
14240 if (isa<AllocaInst>(BundleMember->Inst) ||
14241 BundleMember->Inst->mayReadOrWriteMemory()) {
14242 for (
Instruction *
I = BundleMember->Inst->getNextNode();
14243 I != ScheduleEnd;
I =
I->getNextNode()) {
14244 if (!
match(
I, m_Intrinsic<Intrinsic::stacksave>()) &&
14245 !
match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
14249 MakeControlDependent(
I);
14256 ScheduleData *DepDest = BundleMember->NextLoadStore;
14261 "NextLoadStore list for non memory effecting bundle?");
14263 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14264 unsigned NumAliased = 0;
14265 unsigned DistToSrc = 1;
14267 for (; DepDest; DepDest = DepDest->NextLoadStore) {
14268 assert(isInSchedulingRegion(DepDest));
14278 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
14280 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
14287 DepDest->MemoryDependencies.push_back(BundleMember);
14288 BundleMember->Dependencies++;
14289 ScheduleData *DestBundle = DepDest->FirstInBundle;
14290 if (!DestBundle->IsScheduled) {
14291 BundleMember->incrementUnscheduledDeps(1);
14293 if (!DestBundle->hasValidDependencies()) {
14316 if (InsertInReadyList && SD->isReady()) {
14317 ReadyInsts.insert(SD);
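
// Drop the current schedule of the whole region so that scheduling can be
// redone from scratch.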
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd;
       I = I->getNextNode()) {
    doForAllOpcodes(I, [&](ScheduleData *SD) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->IsScheduled = false;
      SD->resetUnscheduledDeps();
    });
  }
  ReadyInsts.clear();
}
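
// List-schedule the block's scheduling region: dependencies are recalculated
// for all bundles, a ready list ordered by SchedulingPriority is filled, and
// every picked bundle is moved in front of the previously scheduled
// instructions.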
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
      TreeEntry *SDTE = getTreeEntry(SD->Inst);
      assert((isVectorLikeInstWithConstOps(SD->Inst) ||
              SD->isPartOfBundle() ==
                  (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;

      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, false, this);
    });
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      Instruction *PickedInst = BundleMember->Inst;
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveBefore(LastScheduledInst);
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  // Extra verification of the schedule is performed only in expensive-checks
  // builds.
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
      if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
        assert(SD->IsScheduled && "must be scheduled at this point");
      }
    });
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
14427 if (
auto *Store = dyn_cast<StoreInst>(V))
14428 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
14430 if (
auto *IEI = dyn_cast<InsertElementInst>(V))
14433 auto E = InstrElementSize.
find(V);
14434 if (E != InstrElementSize.
end())
14443 if (
auto *
I = dyn_cast<Instruction>(V)) {
14451 Value *FirstNonBool =
nullptr;
14452 while (!Worklist.
empty()) {
14457 auto *Ty =
I->getType();
14458 if (isa<VectorType>(Ty))
14460 if (Ty != Builder.
getInt1Ty() && !FirstNonBool)
14467 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(
I))
14468 Width = std::max<unsigned>(Width,
DL->getTypeSizeInBits(Ty));
14476 for (
Use &U :
I->operands()) {
14477 if (
auto *J = dyn_cast<Instruction>(U.get()))
14478 if (Visited.
insert(J).second &&
14479 (isa<PHINode>(
I) || J->getParent() == Parent)) {
14483 if (!FirstNonBool && U.get()->getType() != Builder.
getInt1Ty())
14484 FirstNonBool = U.get();
14495 if (V->getType() == Builder.
getInt1Ty() && FirstNonBool)
14497 Width =
DL->getTypeSizeInBits(V->getType());
14501 InstrElementSize[
I] = Width;
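
// Collect tree entries whose values can be demoted to a smaller integer bit
// width. Per-opcode checkers (shifts, udiv/urem, select, phi, min/max/abs
// intrinsics) verify that the operation still produces the same result after
// truncation to BitWidth.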
14506bool BoUpSLP::collectValuesToDemote(
14507 const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
14509 unsigned &MaxDepthLevel,
bool &IsProfitableToDemote,
14510 bool IsTruncRoot)
const {
14512 if (
all_of(E.Scalars, IsaPred<Constant>))
14515 unsigned OrigBitWidth =
DL->getTypeSizeInBits(E.Scalars.front()->getType());
14524 auto IsPotentiallyTruncated = [&](
Value *V,
unsigned &
BitWidth) ->
bool {
14533 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
14537 if (
auto *
I = dyn_cast<Instruction>(V)) {
14539 unsigned BitWidth2 =
14540 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
14541 while (!IsSigned && BitWidth2 < OrigBitWidth) {
14547 BitWidth1 = std::min(BitWidth1, BitWidth2);
14552 using namespace std::placeholders;
14553 auto FinalAnalysis = [&]() {
14554 if (!IsProfitableToDemote)
14557 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(
BitWidth)));
14559 if (Res && E.State == TreeEntry::NeedToGather &&
14560 all_of(E.Scalars, IsaPred<Constant>))
14565 if (E.State == TreeEntry::NeedToGather || !Visited.
insert(&E).second ||
14567 return all_of(V->users(), [&](User *U) {
14568 return isa<InsertElementInst>(U) && !getTreeEntry(U);
14571 return FinalAnalysis();
14574 return !all_of(V->users(), [=](User *U) {
14575 return getTreeEntry(U) ||
14576 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14577 (!isa<CmpInst>(U) && U->getType()->isSized() &&
14578 !U->getType()->isScalableTy() &&
14579 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
14580 }) && !IsPotentiallyTruncated(V,
BitWidth);
14585 bool &NeedToExit) {
14586 NeedToExit =
false;
14587 unsigned InitLevel = MaxDepthLevel;
14589 unsigned Level = InitLevel;
14590 if (!collectValuesToDemote(*
Op, IsProfitableToDemoteRoot,
BitWidth,
14591 ToDemote, Visited, Level, IsProfitableToDemote,
14593 if (!IsProfitableToDemote)
14596 if (!FinalAnalysis())
14600 MaxDepthLevel = std::max(MaxDepthLevel, Level);
14604 auto AttemptCheckBitwidth =
14607 NeedToExit =
false;
14608 unsigned BestFailBitwidth = 0;
14610 if (Checker(
BitWidth, OrigBitWidth))
14612 if (BestFailBitwidth == 0 && FinalAnalysis())
14616 if (BestFailBitwidth == 0) {
14627 auto TryProcessInstruction =
14634 (void)
for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
14639 if (E.UserTreeIndices.size() > 1 &&
14640 !
all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
14643 bool NeedToExit =
false;
14644 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
14648 if (!ProcessOperands(
Operands, NeedToExit))
14657 return IsProfitableToDemote;
14659 switch (E.getOpcode()) {
14663 case Instruction::Trunc:
14664 if (IsProfitableToDemoteRoot)
14665 IsProfitableToDemote =
true;
14666 return TryProcessInstruction(
BitWidth);
14667 case Instruction::ZExt:
14668 case Instruction::SExt:
14669 IsProfitableToDemote =
true;
14670 return TryProcessInstruction(
BitWidth);
14674 case Instruction::Add:
14675 case Instruction::Sub:
14676 case Instruction::Mul:
14677 case Instruction::And:
14678 case Instruction::Or:
14679 case Instruction::Xor: {
14680 return TryProcessInstruction(
14681 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
14683 case Instruction::Shl: {
14688 auto *I = cast<Instruction>(V);
14689 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
14690 return AmtKnownBits.getMaxValue().ult(BitWidth);
14693 return TryProcessInstruction(
14694 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
14696 case Instruction::LShr: {
14700 auto LShrChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
14702 auto *I = cast<Instruction>(V);
14703 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
14704 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14705 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
14706 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
14707 SimplifyQuery(*DL));
14710 return TryProcessInstruction(
14711 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
14714 case Instruction::AShr: {
14718 auto AShrChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
14720 auto *I = cast<Instruction>(V);
14721 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
14722 unsigned ShiftedBits = OrigBitWidth - BitWidth;
14723 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
14724 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
14728 return TryProcessInstruction(
14729 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
14732 case Instruction::UDiv:
14733 case Instruction::URem: {
14735 auto Checker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
14738 auto *I = cast<Instruction>(V);
14739 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14740 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
14741 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
14744 return TryProcessInstruction(
14745 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
14749 case Instruction::Select: {
14750 return TryProcessInstruction(
14751 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
14756 case Instruction::PHI: {
14757 const unsigned NumOps = E.getNumOperands();
14760 std::bind(&BoUpSLP::getOperandEntry,
this, &E, _1));
14762 return TryProcessInstruction(
BitWidth, Ops);
14765 case Instruction::Call: {
14766 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
14770 if (
ID != Intrinsic::abs &&
ID != Intrinsic::smin &&
14771 ID != Intrinsic::smax &&
ID != Intrinsic::umin &&
ID != Intrinsic::umax)
14775 auto CompChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
14778 auto *I = cast<Instruction>(V);
14779 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
14780 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14781 return MaskedValueIsZero(I->getOperand(0), Mask,
14782 SimplifyQuery(*DL)) &&
14783 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
14785 assert((
ID == Intrinsic::smin ||
ID == Intrinsic::smax) &&
14786 "Expected min/max intrinsics only.");
14787 unsigned SignBits = OrigBitWidth -
BitWidth;
14800 if (
ID != Intrinsic::abs) {
14801 Operands.push_back(getOperandEntry(&E, 1));
14802 CallChecker = CompChecker;
14805 std::numeric_limits<InstructionCost::CostType>::max();
14807 unsigned VF = E.Scalars.size();
14817 if (
Cost < BestCost) {
14823 [[maybe_unused]]
bool NeedToExit;
14824 (void)AttemptCheckBitwidth(Checker, NeedToExit);
14834 return FinalAnalysis();
14841 bool IsStoreOrInsertElt =
14842 VectorizableTree.front()->getOpcode() == Instruction::Store ||
14843 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
14844 if ((IsStoreOrInsertElt || UserIgnoreList) &&
14845 ExtraBitWidthNodes.
size() <= 1 &&
14846 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
14847 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
14850 unsigned NodeIdx = 0;
14851 if (IsStoreOrInsertElt &&
14852 VectorizableTree.front()->State != TreeEntry::NeedToGather)
14856 if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
14857 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.
empty()) ||
14858 (NodeIdx != 0 &&
any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
14861 static_cast<int>(NodeIdx);
14867 bool IsTruncRoot =
false;
14868 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
14870 if (NodeIdx != 0 &&
14871 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
14872 VectorizableTree[NodeIdx]->
getOpcode() == Instruction::Trunc) {
14873 assert(IsStoreOrInsertElt &&
"Expected store/insertelement seeded graph.");
14874 IsTruncRoot =
true;
14876 IsProfitableToDemoteRoot =
true;
14881 if (AnalyzedMinBWVals.
contains(VectorizableTree[NodeIdx]->Scalars.front()))
14885 auto ComputeMaxBitWidth = [&](
const TreeEntry &E,
bool IsTopRoot,
14886 bool IsProfitableToDemoteRoot,
unsigned Opcode,
14887 unsigned Limit,
bool IsTruncRoot,
14888 bool IsSignedCmp) {
14890 unsigned VF = E.getVectorFactor();
14891 auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
14892 if (!TreeRootIT || !Opcode)
14896 [&](
Value *V) { return AnalyzedMinBWVals.contains(V); }))
14899 unsigned NumParts =
14905 unsigned MaxBitWidth = 1u;
14913 bool IsKnownPositive = !IsSignedCmp &&
all_of(E.Scalars, [&](
Value *R) {
14914 KnownBits Known = computeKnownBits(R, *DL);
14915 return Known.isNonNegative();
14920 for (
Value *Root : E.Scalars) {
14923 unsigned BitWidth1 = NumTypeBits - NumSignBits;
14939 if (!IsKnownPositive)
14943 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
14945 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
14948 if (MaxBitWidth < 8 && MaxBitWidth > 1)
14953 if (NumParts > 1 &&
14959 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
14960 Opcode == Instruction::SExt ||
14961 Opcode == Instruction::ZExt || NumParts > 1;
14966 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
14967 bool NeedToDemote = IsProfitableToDemote;
14969 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
14970 ToDemote, Visited, MaxDepthLevel, NeedToDemote,
14972 (MaxDepthLevel <= Limit &&
14973 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
14974 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
14975 DL->getTypeSizeInBits(TreeRootIT) /
14976 DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
14982 MaxBitWidth =
bit_ceil(MaxBitWidth);
14984 return MaxBitWidth;
14991 if (UserIgnoreList &&
14992 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
14993 for (
Value *V : *UserIgnoreList) {
14995 auto NumTypeBits =
DL->getTypeSizeInBits(V->getType());
14996 unsigned BitWidth1 = NumTypeBits - NumSignBits;
14999 unsigned BitWidth2 = BitWidth1;
15002 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15004 ReductionBitWidth =
15005 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
15007 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
15008 ReductionBitWidth = 8;
15010 ReductionBitWidth =
bit_ceil(ReductionBitWidth);
15012 bool IsTopRoot = NodeIdx == 0;
15013 while (NodeIdx < VectorizableTree.size() &&
15014 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15015 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15018 IsTruncRoot =
true;
15020 bool IsSignedCmp =
false;
15021 while (NodeIdx < VectorizableTree.size()) {
15023 unsigned Limit = 2;
15024 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
15026 ReductionBitWidth ==
15027 DL->getTypeSizeInBits(
15028 VectorizableTree.front()->Scalars.front()->getType()))
15030 unsigned MaxBitWidth = ComputeMaxBitWidth(
15031 *VectorizableTree[NodeIdx].
get(), IsTopRoot, IsProfitableToDemoteRoot,
15032 Opcode, Limit, IsTruncRoot, IsSignedCmp);
15033 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.
empty())) {
15034 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
15035 ReductionBitWidth =
bit_ceil(MaxBitWidth);
15036 else if (MaxBitWidth == 0)
15037 ReductionBitWidth = 0;
15040 for (
unsigned Idx : RootDemotes) {
15042 uint32_t OrigBitWidth =
DL->getTypeSizeInBits(V->getType());
15043 if (OrigBitWidth > MaxBitWidth) {
15051 RootDemotes.clear();
15053 IsProfitableToDemoteRoot =
true;
15055 if (ExtraBitWidthNodes.
empty()) {
15056 NodeIdx = VectorizableTree.size();
15058 unsigned NewIdx = 0;
15060 NewIdx = *ExtraBitWidthNodes.
begin();
15061 ExtraBitWidthNodes.
erase(ExtraBitWidthNodes.
begin());
15062 }
while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.
empty());
15065 NodeIdx < VectorizableTree.size() &&
15066 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15069 EI.
UserTE->getOpcode() == Instruction::Trunc &&
15070 !EI.
UserTE->isAltShuffle();
15073 NodeIdx < VectorizableTree.size() &&
15074 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15076 return EI.
UserTE->getOpcode() == Instruction::ICmp &&
15078 auto *IC = dyn_cast<ICmpInst>(V);
15079 return IC && IC->isSigned();
15086 if (MaxBitWidth == 0 ||
15088 cast<IntegerType>(TreeRoot.
front()->getType())->getBitWidth()) {
15089 if (UserIgnoreList)
15096 for (
unsigned Idx : ToDemote) {
15097 TreeEntry *TE = VectorizableTree[
Idx].get();
15100 bool IsSigned = TE->getOpcode() == Instruction::SExt ||
15102 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15120 bool Changed =
runImpl(
F, SE,
TTI, TLI, AA, LI, DT, AC, DB, ORE);
15145 DL = &
F.getParent()->getDataLayout();
15149 bool Changed =
false;
15155 dbgs() <<
"SLP: Didn't find any vector registers for target, abort.\n");
15160 if (
F.hasFnAttribute(Attribute::NoImplicitFloat))
15163 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing blocks in " <<
F.getName() <<
".\n");
15167 BoUpSLP R(&
F, SE,
TTI, TLI, AA, LI, DT, AC, DB,
DL, ORE_);
15173 DT->updateDFSNumbers();
15176 for (
auto *BB :
post_order(&
F.getEntryBlock())) {
15178 R.clearReductionData();
15179 collectSeedInstructions(BB);
15182 if (!Stores.empty()) {
15184 <<
" underlying objects.\n");
15185 Changed |= vectorizeStoreChains(R);
15189 Changed |= vectorizeChainsInBlock(BB, R);
15194 if (!GEPs.
empty()) {
15196 <<
" underlying objects.\n");
15197 Changed |= vectorizeGEPIndices(BB, R);
15202 R.optimizeGatherSequence();
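
// Try to vectorize one chain of consecutive stores: build the SLP tree for
// the chain, reorder it, compute minimum value sizes and emit vector code
// only if the cost model reports a profitable cost.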
15209 unsigned Idx,
unsigned MinVF) {
15212 const unsigned Sz = R.getVectorElementSize(Chain[0]);
15213 unsigned VF = Chain.
size();
15226 R.buildTree(Chain);
15227 if (R.isTreeTinyAndNotFullyVectorizable())
15229 if (R.isLoadCombineCandidate())
15231 R.reorderTopToBottom();
15232 R.reorderBottomToTop();
15233 R.buildExternalUses();
15235 R.computeMinimumValueSizes();
15236 R.transformNodes();
15244 using namespace ore;
15247 cast<StoreInst>(Chain[0]))
15248 <<
"Stores SLP vectorized with cost " << NV(
"Cost",
Cost)
15249 <<
" and with tree size "
15250 << NV(
"TreeSize", R.getTreeSize()));
15264 bool Changed =
false;
15270 struct StoreDistCompare {
15271 bool operator()(
const std::pair<unsigned, int> &Op1,
15272 const std::pair<unsigned, int> &Op2)
const {
15273 return Op1.second < Op2.second;
15278 using StoreIndexToDistSet =
15279 std::set<std::pair<unsigned, int>, StoreDistCompare>;
15280 auto TryToVectorize = [&](
const StoreIndexToDistSet &Set) {
15285 if (
Operands.empty() ||
Data.second - PrevDist == 1) {
15287 PrevDist =
Data.second;
15288 if (
Idx != Set.size() - 1)
15293 Operands.push_back(Stores[DataVar.first]);
15294 PrevDist = DataVar.second;
15300 unsigned MaxVecRegSize =
R.getMaxVecRegSize();
15301 unsigned EltSize =
R.getVectorElementSize(
Operands[0]);
15305 std::min(
R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
15307 Type *StoreTy =
Store->getValueOperand()->getType();
15308 Type *ValueTy = StoreTy;
15309 if (
auto *Trunc = dyn_cast<TruncInst>(
Store->getValueOperand()))
15310 ValueTy = Trunc->getSrcTy();
15312 R.getMinVF(
DL->getTypeStoreSizeInBits(StoreTy)), StoreTy, ValueTy));
15314 if (MaxVF < MinVF) {
15315 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
15317 <<
"MinVF (" << MinVF <<
")\n");
15321 unsigned NonPowerOf2VF = 0;
15326 unsigned CandVF =
Operands.size();
15328 NonPowerOf2VF = CandVF;
15333 unsigned Size = MinVF;
15335 VF =
Size > MaxVF ? NonPowerOf2VF :
Size;
15338 unsigned StartIdx = 0;
15339 for (
unsigned Size : CandidateVFs) {
15340 for (
unsigned Cnt = StartIdx, E =
Operands.size(); Cnt +
Size <= E;) {
15346 return cast<StoreInst>(V)->getValueOperand()->getType() ==
15347 cast<StoreInst>(Slice.
front())
15348 ->getValueOperand()
15351 "Expected all operands of same type.");
15352 if (!VectorizedStores.
count(Slice.
front()) &&
15353 !VectorizedStores.
count(Slice.
back()) &&
15356 vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
15362 if (Cnt == StartIdx)
15418 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
15420 Stores[Set.first]->getValueOperand()->getType(),
15421 Stores[Set.first]->getPointerOperand(),
15422 SI->getValueOperand()->getType(),
SI->getPointerOperand(), *
DL, *SE,
15426 auto It = Set.second.find(std::make_pair(
Idx, *Diff));
15427 if (It == Set.second.end()) {
15428 Set.second.emplace(
Idx, *Diff);
15432 TryToVectorize(Set.second);
15433 StoreIndexToDistSet PrevSet;
15434 PrevSet.swap(Set.second);
15436 Set.second.emplace(
Idx, 0);
15439 unsigned StartIdx = It->first + 1;
15444 for (
const std::pair<unsigned, int> &Pair :
reverse(PrevSet)) {
15446 if (Pair.first <= It->first ||
15447 VectorizedStores.
contains(Stores[Pair.first]))
15449 unsigned BI = Pair.first - StartIdx;
15450 UsedStores.set(BI);
15451 Dists[BI] = Pair.second - It->second;
15453 for (
unsigned I = StartIdx;
I <
Idx; ++
I) {
15454 unsigned BI =
I - StartIdx;
15455 if (UsedStores.test(BI))
15456 Set.second.emplace(
I, Dists[BI]);
15460 auto &Res = SortedStores.emplace_back();
15462 Res.second.emplace(
Idx, 0);
15468 SI->getValueOperand()->getType()) {
15469 for (
auto &Set : SortedStores)
15470 TryToVectorize(Set.second);
15471 SortedStores.clear();
15474 FillStoresSet(
I, SI);
15478 for (
auto &Set : SortedStores)
15479 TryToVectorize(Set.second);
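
// Collect vectorization seeds in the block: simple stores are grouped by the
// underlying object of their pointer operand, and single-index, non-constant,
// scalar GEPs are grouped by their pointer operand.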
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }
    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
15526 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize a list of length = "
15527 << VL.
size() <<
".\n");
15532 if (!S.getOpcode())
15538 for (
Value *V : VL) {
15539 Type *Ty =
V->getType();
15543 R.getORE()->emit([&]() {
15544 std::string TypeStr;
15548 <<
"Cannot SLP vectorize list: type "
15549 << rso.str() +
" is unsupported by vectorizer";
15555 unsigned Sz =
R.getVectorElementSize(I0);
15556 unsigned MinVF =
R.getMinVF(Sz);
15557 unsigned MaxVF = std::max<unsigned>(
llvm::bit_floor(VL.size()), MinVF);
15558 MaxVF = std::min(
R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
15560 R.getORE()->emit([&]() {
15562 <<
"Cannot SLP vectorize list: vectorization factor "
15563 <<
"less than 2 is not supported";
15568 bool Changed =
false;
15569 bool CandidateFound =
false;
15571 Type *ScalarTy = VL[0]->getType();
15572 if (
auto *IE = dyn_cast<InsertElementInst>(VL[0]))
15573 ScalarTy =
IE->getOperand(1)->getType();
15575 unsigned NextInst = 0, MaxInst = VL.size();
15576 for (
unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
15583 for (
unsigned I = NextInst;
I < MaxInst; ++
I) {
15584 unsigned ActualVF = std::min(MaxInst -
I, VF);
15589 if (MaxVFOnly && ActualVF < MaxVF)
15591 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
15597 auto *
I = dyn_cast<Instruction>(V);
15598 return I &&
R.isDeleted(
I);
15602 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << ActualVF <<
" operations "
15606 if (
R.isTreeTinyAndNotFullyVectorizable())
15608 R.reorderTopToBottom();
15609 R.reorderBottomToTop(
15610 !isa<InsertElementInst>(Ops.
front()) &&
15611 !
R.doesRootHaveInTreeUses());
15612 R.buildExternalUses();
15614 R.computeMinimumValueSizes();
15615 R.transformNodes();
15617 CandidateFound =
true;
15618 MinCost = std::min(MinCost,
Cost);
15621 <<
" for VF=" << ActualVF <<
"\n");
15625 cast<Instruction>(Ops[0]))
15626 <<
"SLP vectorized with cost " <<
ore::NV(
"Cost",
Cost)
15627 <<
" and with tree size "
15628 <<
ore::NV(
"TreeSize",
R.getTreeSize()));
15639 if (!Changed && CandidateFound) {
15640 R.getORE()->emit([&]() {
15642 <<
"List vectorization was possible but not beneficial with cost "
15643 <<
ore::NV(
"Cost", MinCost) <<
" >= "
15646 }
else if (!Changed) {
15647 R.getORE()->emit([&]() {
15649 <<
"Cannot SLP vectorize list: vectorization was impossible"
15650 <<
" with available vectorization factors";
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
    return false;

  // First collect all possible candidates.
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P)
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P)
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P)
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P)
      Candidates.emplace_back(A1, B);
  }

  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
      R);
}
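
// Model of a horizontal reduction: a chain of commutative operations
// (add/mul/and/or/xor, fadd/fmul, integer/FP min/max) that folds a set of
// reduced values into a single scalar, e.g. for an add reduction:
//
//   %r1 = add i32 %a, %b
//   %r2 = add i32 %r1, %c
//   %r3 = add i32 %r2, %d
//
// The reduced values are vectorized with BoUpSLP and the scalar chain is
// replaced by a vector reduction plus any leftover scalar operations.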
15738 ReductionOpsListType ReductionOps;
15750 bool IsSupportedHorRdxIdentityOp =
false;
15761 return isa<SelectInst>(
I) &&
15767 if (Kind == RecurKind::None)
15775 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
15779 return I->getFastMathFlags().noNaNs();
15782 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
15785 return I->isAssociative();
15794 return I->getOperand(2);
15795 return I->getOperand(
Index);
15803 case RecurKind::Or:
15809 case RecurKind::And:
15815 case RecurKind::Add:
15816 case RecurKind::Mul:
15817 case RecurKind::Xor:
15818 case RecurKind::FAdd:
15819 case RecurKind::FMul:
15822 case RecurKind::FMax:
15824 case RecurKind::FMin:
15826 case RecurKind::FMaximum:
15828 case RecurKind::FMinimum:
15830 case RecurKind::SMax:
15836 case RecurKind::SMin:
15842 case RecurKind::UMax:
15848 case RecurKind::UMin:
15863 const ReductionOpsListType &ReductionOps) {
15864 bool UseSelect = ReductionOps.size() == 2 ||
15866 (ReductionOps.size() == 1 &&
15867 any_of(ReductionOps.front(), IsaPred<SelectInst>));
15868 assert((!UseSelect || ReductionOps.size() != 2 ||
15869 isa<SelectInst>(ReductionOps[1][0])) &&
15870 "Expected cmp + select pairs for reduction");
15873 if (
auto *Sel = dyn_cast<SelectInst>(
Op)) {
15887 auto *
I = dyn_cast<Instruction>(V);
15889 return RecurKind::None;
15891 return RecurKind::Add;
15893 return RecurKind::Mul;
15896 return RecurKind::And;
15899 return RecurKind::Or;
15901 return RecurKind::Xor;
15903 return RecurKind::FAdd;
15905 return RecurKind::FMul;
15908 return RecurKind::FMax;
15910 return RecurKind::FMin;
15913 return RecurKind::FMaximum;
15915 return RecurKind::FMinimum;
15921 return RecurKind::SMax;
15923 return RecurKind::SMin;
15925 return RecurKind::UMax;
15927 return RecurKind::UMin;
15929 if (
auto *
Select = dyn_cast<SelectInst>(
I)) {
15951 if (!isa<ExtractElementInst>(
RHS) ||
15953 return RecurKind::None;
15955 if (!isa<ExtractElementInst>(
LHS) ||
15957 return RecurKind::None;
15959 if (!isa<ExtractElementInst>(
LHS) || !isa<ExtractElementInst>(
RHS))
15960 return RecurKind::None;
15964 return RecurKind::None;
15969 return RecurKind::None;
15972 return RecurKind::SMax;
15975 return RecurKind::SMin;
15978 return RecurKind::UMax;
15981 return RecurKind::UMin;
15984 return RecurKind::None;
15988 static unsigned getFirstOperandIndex(
Instruction *
I) {
15989 return isCmpSelMinMax(
I) ? 1 : 0;
15995 return isCmpSelMinMax(
I) ? 3 : 2;
16001 if (isCmpSelMinMax(
I) || isBoolLogicOp(
I)) {
16002 auto *Sel = cast<SelectInst>(
I);
16003 auto *
Cmp = dyn_cast<Instruction>(Sel->getCondition());
16004 return Sel->getParent() == BB &&
Cmp &&
Cmp->getParent() == BB;
16006 return I->getParent() == BB;
16010 static bool hasRequiredNumberOfUses(
bool IsCmpSelMinMax,
Instruction *
I) {
16011 if (IsCmpSelMinMax) {
16014 if (
auto *Sel = dyn_cast<SelectInst>(
I))
16015 return Sel->
hasNUses(2) && Sel->getCondition()->hasOneUse();
16016 return I->hasNUses(2);
16020 return I->hasOneUse();
16025 if (isCmpSelMinMax(
I))
16026 ReductionOps.assign(2, ReductionOpsType());
16028 ReductionOps.assign(1, ReductionOpsType());
16033 if (isCmpSelMinMax(
I)) {
16034 ReductionOps[0].emplace_back(cast<SelectInst>(
I)->getCondition());
16035 ReductionOps[1].emplace_back(
I);
16037 ReductionOps[0].emplace_back(
I);
16042 int Sz = Data.size();
16043 auto *
I = dyn_cast<Instruction>(Data.front());
16044 return Sz > 1 ||
isConstant(Data.front()) ||
16055 RdxKind = HorizontalReduction::getRdxKind(Root);
16056 if (!isVectorizable(RdxKind, Root))
16067 if (
auto *Sel = dyn_cast<SelectInst>(Root))
16068 if (!Sel->getCondition()->hasOneUse())
16071 ReductionRoot = Root;
16076 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
16085 for (
int I = getFirstOperandIndex(TreeN),
16086 End = getNumberOfOperands(TreeN);
16088 Value *EdgeVal = getRdxOperand(TreeN,
I);
16089 ReducedValsToOps[EdgeVal].push_back(TreeN);
16090 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
16093 !hasSameParent(EdgeInst, BB)) {
16094 ExtraArgs.push_back(EdgeVal);
16101 if (!EdgeInst ||
getRdxKind(EdgeInst) != RdxKind ||
16102 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
16103 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
16104 !isVectorizable(RdxKind, EdgeInst) ||
16105 (
R.isAnalyzedReductionRoot(EdgeInst) &&
16106 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
16107 PossibleReducedVals.push_back(EdgeVal);
16110 ReductionOps.push_back(EdgeInst);
16119 PossibleReducedVals;
16120 initReductionOps(Root);
16125 auto GenerateLoadsSubkey = [&](
size_t Key,
LoadInst *LI) {
16128 auto LIt = LoadsMap.
find(
Ptr);
16129 if (LIt != LoadsMap.
end()) {
16130 for (
LoadInst *RLI : LIt->second) {
16136 for (
LoadInst *RLI : LIt->second) {
16140 DoNotReverseVals.
insert(RLI);
16144 if (LIt->second.size() > 2) {
16146 hash_value(LIt->second.back()->getPointerOperand());
16147 DoNotReverseVals.
insert(LIt->second.back());
16152 LoadKeyUsed.
insert(Key);
16157 while (!Worklist.empty()) {
16162 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
16165 if (
Args.size() < 2) {
16166 addReductionOps(TreeN);
16168 if (!
Args.empty()) {
16169 assert(
Args.size() == 1 &&
"Expected only single argument.");
16170 ExtraArgs[TreeN] =
Args.front();
16174 for (
Value *V : PossibleRedVals) {
16178 ++PossibleReducedVals[
Key][
Idx]
16179 .
insert(std::make_pair(V, 0))
16182 Worklist.append(PossibleReductionOps.
rbegin(),
16183 PossibleReductionOps.
rend());
16188 ++PossibleReducedVals[
Key][
Idx]
16189 .
insert(std::make_pair(TreeN, 0))
16193 auto PossibleReducedValsVect = PossibleReducedVals.
takeVector();
16196 for (
auto &PossibleReducedVals : PossibleReducedValsVect) {
16197 auto PossibleRedVals = PossibleReducedVals.second.
takeVector();
16199 for (
auto It = PossibleRedVals.begin(),
E = PossibleRedVals.end();
16202 auto RedValsVect = It->second.takeVector();
16204 for (
const std::pair<Value *, unsigned> &Data : RedValsVect)
16205 PossibleRedValsVect.
back().append(Data.second, Data.first);
16207 stable_sort(PossibleRedValsVect, [](
const auto &P1,
const auto &P2) {
16208 return P1.size() > P2.size();
16212 if (isGoodForReduction(Data) ||
16213 (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
16214 isa<LoadInst>(ReducedVals[NewIdx].front()) &&
16216 cast<LoadInst>(Data.front())->getPointerOperand()) ==
16220 NewIdx = ReducedVals.
size();
16223 if (DoNotReverseVals.
contains(Data.front()))
16224 ReducedVals[NewIdx].
append(Data.begin(), Data.end());
16226 ReducedVals[NewIdx].
append(Data.rbegin(), Data.rend());
16228 ReducedVals.
emplace_back().append(Data.rbegin(), Data.rend());
16243 constexpr int ReductionLimit = 4;
16244 constexpr unsigned RegMaxNumber = 4;
16245 constexpr unsigned RedValsMaxNumber = 128;
16249 unsigned NumReducedVals =
16250 std::accumulate(ReducedVals.
begin(), ReducedVals.
end(), 0,
16252 if (!isGoodForReduction(Vals))
16254 return Num + Vals.size();
16256 if (NumReducedVals < ReductionLimit &&
16261 for (ReductionOpsType &RdxOps : ReductionOps)
16262 for (
Value *RdxOp : RdxOps)
16263 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
16274 ReducedVals.
size() * ReducedVals.
front().size() + ExtraArgs.size());
16277 ExternallyUsedValues.
reserve(ExtraArgs.size() + 1);
16280 for (
const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
16281 assert(Pair.first &&
"DebugLoc must be set.");
16282 ExternallyUsedValues[Pair.second].push_back(Pair.first);
16283 TrackedVals.
try_emplace(Pair.second, Pair.second);
16288 auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
16289 assert(isa<SelectInst>(RdxRootInst) &&
16290 "Expected min/max reduction to have select root instruction");
16291 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
16292 assert(isa<Instruction>(ScalarCond) &&
16293 "Expected min/max reduction to have compare condition");
16294 return cast<Instruction>(ScalarCond);
16298 auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
16299 if (VectorizedTree) {
16302 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
16303 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
16306 auto It = ReducedValsToOps.
find(Res);
16307 if (It != ReducedValsToOps.
end() &&
16313 return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
16319 bool AnyBoolLogicOp =
16321 return isBoolLogicOp(cast<Instruction>(V));
16325 ExternallyUsedValues[ReductionRoot];
16327 ReductionOps.front().size());
16328 for (ReductionOpsType &RdxOps : ReductionOps)
16329 for (
Value *RdxOp : RdxOps) {
16332 IgnoreList.insert(RdxOp);
16337 for (
Value *U : IgnoreList)
16338 if (
auto *FPMO = dyn_cast<FPMathOperator>(U))
16339 RdxFMF &= FPMO->getFastMathFlags();
16340 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
16345 for (
Value *V : Candidates)
16346 TrackedVals.try_emplace(V, V);
16352 Value *VectorizedTree =
nullptr;
16353 bool CheckForReusedReductionOps =
false;
16355 for (
unsigned I = 0,
E = ReducedVals.
size();
I <
E; ++
I) {
16361 for (
unsigned Cnt = 0, Sz = OrigReducedVals.
size(); Cnt < Sz; ++Cnt) {
16362 Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
16367 auto *Inst = dyn_cast<Instruction>(RdxVal);
16369 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
16370 (S.getOpcode() && !Inst))
16373 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
16375 bool ShuffledExtracts =
false;
16377 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
16379 InstructionsState NextS =
getSameOpcode(ReducedVals[
I + 1], TLI);
16380 if (NextS.getOpcode() == Instruction::ExtractElement &&
16381 !NextS.isAltShuffle()) {
16383 for (
Value *RV : ReducedVals[
I + 1]) {
16384 Value *RdxVal = TrackedVals.find(RV)->second;
16388 if (
auto *Inst = dyn_cast<Instruction>(RdxVal))
16389 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
16391 CommonCandidates.push_back(RdxVal);
16392 TrackedToOrig.try_emplace(RdxVal, RV);
16397 Candidates.
swap(CommonCandidates);
16398 ShuffledExtracts =
true;
16407 ++VectorizedVals.try_emplace(Candidates.
front(), 0).first->getSecond();
16409 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
16410 ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
16411 if (
auto *ResI = dyn_cast<Instruction>(Res))
16412 V.analyzedReductionRoot(ResI);
16414 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
16418 unsigned NumReducedVals = Candidates.
size();
16419 if (NumReducedVals < ReductionLimit &&
16426 IsSupportedHorRdxIdentityOp =
16428 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
16431 if (IsSupportedHorRdxIdentityOp)
16432 for (
Value *V : Candidates)
16433 ++SameValuesCounter.
insert(std::make_pair(V, 0)).first->second;
16444 bool SameScaleFactor =
false;
16445 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
16446 SameValuesCounter.
size() != Candidates.size();
16447 if (OptReusedScalars) {
16449 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
16450 RdxKind == RecurKind::Xor) &&
16452 [&SameValuesCounter](
const std::pair<Value *, unsigned> &
P) {
16453 return P.second == SameValuesCounter.
front().second;
16455 Candidates.resize(SameValuesCounter.
size());
16456 transform(SameValuesCounter, Candidates.begin(),
16457 [](
const auto &
P) { return P.first; });
16458 NumReducedVals = Candidates.size();
16460 if (NumReducedVals == 1) {
16461 Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
16462 unsigned Cnt = SameValuesCounter.
lookup(OrigV);
16464 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
16465 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
16466 VectorizedVals.try_emplace(OrigV, Cnt);
16471 unsigned MaxVecRegSize =
V.getMaxVecRegSize();
16472 unsigned EltSize =
V.getVectorElementSize(Candidates[0]);
16476 unsigned ReduxWidth = std::min<unsigned>(
16478 std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
16479 RegMaxNumber * RedValsMaxNumber));
16480 unsigned Start = 0;
16481 unsigned Pos = Start;
16483 unsigned PrevReduxWidth = ReduxWidth;
16484 bool CheckForReusedReductionOpsLocal =
false;
16485 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
16486 &CheckForReusedReductionOpsLocal,
16487 &PrevReduxWidth, &
V,
16488 &IgnoreList](
bool IgnoreVL =
false) {
16489 bool IsAnyRedOpGathered = !IgnoreVL &&
V.isAnyGathered(IgnoreList);
16490 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
16493 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
16496 if (Pos < NumReducedVals - ReduxWidth + 1)
16497 return IsAnyRedOpGathered;
16500 return IsAnyRedOpGathered;
16502 bool AnyVectorized =
false;
16503 while (Pos < NumReducedVals - ReduxWidth + 1 &&
16504 ReduxWidth >= ReductionLimit) {
16507 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
16509 CheckForReusedReductionOps =
true;
16512 PrevReduxWidth = ReduxWidth;
16515 if (
V.areAnalyzedReductionVals(VL)) {
16516 (void)AdjustReducedVals(
true);
16522 auto *RedValI = dyn_cast<Instruction>(RedVal);
16525 return V.isDeleted(RedValI);
16528 V.buildTree(VL, IgnoreList);
16529 if (
V.isTreeTinyAndNotFullyVectorizable(
true)) {
16530 if (!AdjustReducedVals())
16531 V.analyzedReductionVals(VL);
16534 if (
V.isLoadCombineReductionCandidate(RdxKind)) {
16535 if (!AdjustReducedVals())
16536 V.analyzedReductionVals(VL);
16539 V.reorderTopToBottom();
16541 V.reorderBottomToTop(
true);
16545 ExternallyUsedValues);
16546 for (
unsigned Cnt = 0, Sz = ReducedVals.
size(); Cnt < Sz; ++Cnt) {
16547 if (Cnt ==
I || (ShuffledExtracts && Cnt ==
I - 1))
16549 for (
Value *V : ReducedVals[Cnt])
16550 if (isa<Instruction>(V))
16551 LocalExternallyUsedValues[TrackedVals[
V]];
16553 if (!IsSupportedHorRdxIdentityOp) {
16556 "Reused values counter map is not empty");
16557 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
16558 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
16560 Value *
V = Candidates[Cnt];
16561 Value *OrigV = TrackedToOrig.find(V)->second;
16562 ++SameValuesCounter[OrigV];
16568 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
16569 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
16571 Value *RdxVal = Candidates[Cnt];
16572 if (!Visited.
insert(RdxVal).second)
16576 if (!VLScalars.contains(RdxVal) &&
V.isVectorized(RdxVal)) {
16577 LocalExternallyUsedValues[RdxVal];
16580 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
16582 VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
16583 if (NumOps != ReducedValsToOps.
find(OrigV)->second.size())
16584 LocalExternallyUsedValues[RdxVal];
16587 if (!IsSupportedHorRdxIdentityOp)
16588 SameValuesCounter.
clear();
16589 for (
Value *RdxVal : VL)
16590 if (RequiredExtract.
contains(RdxVal))
16591 LocalExternallyUsedValues[RdxVal];
16595 for (
const std::pair<Value *, Value *> &Pair : ReplacedExternals)
16596 ReplacementToExternal.
try_emplace(Pair.second, Pair.first);
16597 for (
const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
16599 auto RIt = ReplacementToExternal.
find(Ext);
16600 while (RIt != ReplacementToExternal.
end()) {
16602 RIt = ReplacementToExternal.
find(Ext);
16604 auto *It = ExternallyUsedValues.
find(Ext);
16605 if (It == ExternallyUsedValues.
end())
16607 LocalExternallyUsedValues[Pair.second].append(It->second);
16609 V.buildExternalUses(LocalExternallyUsedValues);
16611 V.computeMinimumValueSizes();
16612 V.transformNodes();
16617 getReductionCost(
TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
16620 <<
" for reduction\n");
16624 V.getORE()->emit([&]() {
16626 SV_NAME,
"HorSLPNotBeneficial",
16627 ReducedValsToOps.
find(VL[0])->second.front())
16628 <<
"Vectorizing horizontal reduction is possible "
16629 <<
"but not beneficial with cost " <<
ore::NV(
"Cost",
Cost)
16630 <<
" and threshold "
16633 if (!AdjustReducedVals())
16634 V.analyzedReductionVals(VL);
16638 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorizing horizontal reduction at cost:"
16639 <<
Cost <<
". (HorRdx)\n");
16640 V.getORE()->emit([&]() {
16642 SV_NAME,
"VectorizedHorizontalReduction",
16643 ReducedValsToOps.
find(VL[0])->second.front())
16644 <<
"Vectorized horizontal reduction with cost "
16645 <<
ore::NV(
"Cost",
Cost) <<
" and with tree size "
16646 <<
ore::NV(
"TreeSize",
V.getTreeSize());
16653 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
16655 if (IsCmpSelMinMax)
16656 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
16659 Value *VectorizedRoot =
V.vectorizeTree(LocalExternallyUsedValues,
16660 ReplacedExternals, InsertPt);
16667 if ((isBoolLogicOp(RdxRootInst) ||
16668 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
16670 VectorizedRoot = Builder.
CreateFreeze(VectorizedRoot);
16673 if (OptReusedScalars && !SameScaleFactor) {
16675 emitReusedOps(VectorizedRoot, Builder,
V.getRootNodeScalars(),
16676 SameValuesCounter, TrackedToOrig);
16679 Value *ReducedSubTree =
16680 emitReduction(VectorizedRoot, Builder, ReduxWidth,
TTI);
16681 if (ReducedSubTree->
getType() != VL.front()->getType()) {
16683 ReducedSubTree, VL.front()->getType(),
any_of(VL, [&](
Value *R) {
16685 R, cast<Instruction>(ReductionOps.front().front())
16687 ->getDataLayout());
16695 if (OptReusedScalars && SameScaleFactor)
16696 ReducedSubTree = emitScaleForReusedOps(
16697 ReducedSubTree, Builder, SameValuesCounter.
front().second);
16699 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
16701 for (
Value *RdxVal : VL) {
16702 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
16703 if (IsSupportedHorRdxIdentityOp) {
16704 VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
16707 ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
16708 if (!
V.isVectorized(RdxVal))
16709 RequiredExtract.
insert(RdxVal);
16714 AnyVectorized =
true;
16716 if (OptReusedScalars && !AnyVectorized) {
16717 for (
const std::pair<Value *, unsigned> &
P : SameValuesCounter) {
16718 Value *RedVal = emitScaleForReusedOps(
P.first, Builder,
P.second);
16719 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
16720 Value *OrigV = TrackedToOrig.find(
P.first)->second;
16721 VectorizedVals.try_emplace(OrigV,
P.second);
16726 if (VectorizedTree) {
16747 if (!AnyBoolLogicOp)
16749 if (isBoolLogicOp(RedOp1) &&
16750 ((!InitStep &&
LHS == VectorizedTree) ||
16753 if (isBoolLogicOp(RedOp2) && ((!InitStep &&
RHS == VectorizedTree) ||
16754 getRdxOperand(RedOp2, 0) ==
RHS ||
16759 if (
LHS != VectorizedTree)
16770 unsigned Sz = InstVals.
size();
16773 for (
unsigned I = 0,
E = (Sz / 2) * 2;
I <
E;
I += 2) {
16776 Value *RdxVal1 = InstVals[
I].second;
16777 Value *StableRdxVal1 = RdxVal1;
16778 auto It1 = TrackedVals.find(RdxVal1);
16779 if (It1 != TrackedVals.end())
16780 StableRdxVal1 = It1->second;
16781 Value *RdxVal2 = InstVals[
I + 1].second;
16782 Value *StableRdxVal2 = RdxVal2;
16783 auto It2 = TrackedVals.find(RdxVal2);
16784 if (It2 != TrackedVals.end())
16785 StableRdxVal2 = It2->second;
16789 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[
I].first,
16791 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
16792 StableRdxVal2,
"op.rdx", ReductionOps);
16793 ExtraReds[
I / 2] = std::make_pair(InstVals[
I].first, ExtraRed);
16796 ExtraReds[Sz / 2] = InstVals.
back();
16800 ExtraReductions.
emplace_back(cast<Instruction>(ReductionRoot),
16804 for (
Value *RdxVal : Candidates) {
16805 if (!Visited.
insert(RdxVal).second)
16807 unsigned NumOps = VectorizedVals.lookup(RdxVal);
16814 for (
auto &Pair : ExternallyUsedValues) {
16816 for (
auto *
I : Pair.second)
16820 bool InitStep =
true;
16821 while (ExtraReductions.
size() > 1) {
16822 VectorizedTree = ExtraReductions.
front().second;
16824 FinalGen(ExtraReductions, InitStep);
16825 ExtraReductions.
swap(NewReds);
16828 VectorizedTree = ExtraReductions.
front().second;
16830 ReductionRoot->replaceAllUsesWith(VectorizedTree);
16839 IgnoreSet.
insert(RdxOps.begin(), RdxOps.end());
16846 for (
auto *U :
Ignore->users()) {
16848 "All users must be either in the reduction ops list.");
16851 if (!
Ignore->use_empty()) {
16853 Ignore->replaceAllUsesWith(Undef);
16855 V.eraseInstruction(cast<Instruction>(
Ignore));
16858 }
else if (!CheckForReusedReductionOps) {
16859 for (ReductionOpsType &RdxOps : ReductionOps)
16860 for (
Value *RdxOp : RdxOps)
16861 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
16863 return VectorizedTree;
16870 bool IsCmpSelMinMax,
unsigned ReduxWidth,
16873 Type *ScalarTy = ReducedVals.
front()->getType();
16882 int Cnt = ReducedVals.
size();
16883 for (
Value *RdxVal : ReducedVals) {
16888 Cost += GenCostFn();
16893 auto *RdxOp = cast<Instruction>(U);
16894 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
16902 Cost += ScalarCost;
16904 Cost += GenCostFn();
16909 case RecurKind::Add:
16910 case RecurKind::Mul:
16911 case RecurKind::Or:
16912 case RecurKind::And:
16913 case RecurKind::Xor:
16914 case RecurKind::FAdd:
16915 case RecurKind::FMul: {
16920 ScalarCost = EvaluateScalarCost([&]() {
16925 case RecurKind::FMax:
16926 case RecurKind::FMin:
16927 case RecurKind::FMaximum:
16928 case RecurKind::FMinimum:
16929 case RecurKind::SMax:
16930 case RecurKind::SMin:
16931 case RecurKind::UMax:
16932 case RecurKind::UMin: {
16936 ScalarCost = EvaluateScalarCost([&]() {
16946 LLVM_DEBUG(
dbgs() <<
"SLP: Adding cost " << VectorCost - ScalarCost
16948 <<
" (It is a splitting reduction)\n");
16949 return VectorCost - ScalarCost;
  // Emit the final horizontal reduction of the vectorized value.
  assert(VectorizedValue && "Need to have a vectorized tree node");
  assert(isPowerOf2_32(ReduxWidth) &&
         "We only handle power-of-two reductions for now");
  assert(RdxKind != RecurKind::FMulAdd &&
         "A call to the llvm.fmuladd intrinsic is not handled yet");

  ++NumVectorInstructions;
  // ...

  // Emit the scale for a scalar matched as the same value reused Cnt times
  // across the reduced operands.
  assert(IsSupportedHorRdxIdentityOp &&
         "The optimization of matched scalar identity horizontal reductions "
         "must be supported.");
  switch (RdxKind) {
  case RecurKind::Add: {
    // Cnt identical adds fold into a single multiply by Cnt.
    Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
    LLVM_DEBUG(dbgs() << /*...*/ << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateMul(VectorizedValue, Scale);
  }
  case RecurKind::Xor: {
    // An even number of identical xors cancels out; an odd number leaves the
    // value unchanged.
    LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                      << ". (HorRdx)\n");
    // ...
    return VectorizedValue;
  }
  case RecurKind::FAdd: {
    // Cnt identical fadds fold into a single fmul by Cnt.
    Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
    LLVM_DEBUG(dbgs() << /*...*/ << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateFMul(VectorizedValue, Scale);
  }
  case RecurKind::And:
  case RecurKind::Or:
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin:
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
    // These operations are idempotent: repeating an operand changes nothing.
    return VectorizedValue;
  case RecurKind::Mul:
  case RecurKind::FMul:
  case RecurKind::FMulAdd:
  case RecurKind::IAnyOf:
  case RecurKind::FAnyOf:
  case RecurKind::None:
    // ... (these kinds are not expected for reused scalars)
  }
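// --- Illustrative sketch (not part of SLPVectorizer.cpp) ---------------------
// The switch above exploits algebraic identities so that a scalar occurring
// Cnt times among the reduced values does not need Cnt real operations. A
// standalone illustration of the same identities on plain integers, with
// hypothetical helper names:
#include <cassert>
#include <cstdint>

// N copies of V in an add-reduction contribute V * N.
std::int64_t addOfRepeated(std::int64_t V, unsigned N) { return V * N; }

// N copies of V in a xor-reduction cancel pairwise: even count -> 0, odd -> V.
std::int64_t xorOfRepeated(std::int64_t V, unsigned N) { return (N % 2) ? V : 0; }

// N copies of V in an and/or/min/max reduction are idempotent: result is V.
std::int64_t idempotentOfRepeated(std::int64_t V, unsigned /*N*/) { return V; }

int main() {
  assert(addOfRepeated(7, 4) == 7 + 7 + 7 + 7);
  assert(xorOfRepeated(7, 4) == (7 ^ 7 ^ 7 ^ 7));
  assert(xorOfRepeated(7, 3) == (7 ^ 7 ^ 7));
  assert(idempotentOfRepeated(7, 4) == 7);
  return 0;
}
// -----------------------------------------------------------------------------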
  // Emit per-lane scales for reused reduction operands, where each lane may
  // have its own repeat count (SameValuesCounter).
  assert(IsSupportedHorRdxIdentityOp &&
         "The optimization of matched scalar identity horizontal reductions "
         "must be supported.");
  auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
  if (VTy->getElementType() != VL.front()->getType()) {
    // ...
    /* ... */ R, cast<Instruction>(ReductionOps.front().front())
                     ->getDataLayout());
    // ...
  }
  switch (RdxKind) {
  case RecurKind::Add: {
    // root = mul prev_root, <1, 1, n, 1>
    for (Value *V : VL) {
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
      Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
    }
    // ...
    LLVM_DEBUG(dbgs() << /*...*/ << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateMul(VectorizedValue, Scale);
  }
  case RecurKind::And:
  case RecurKind::Or:
    // No extra work: repeated operands do not change an and/or reduction.
    LLVM_DEBUG(dbgs() << /*...*/ << ". (HorRdx)\n");
    return VectorizedValue;
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin:
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
    // Likewise idempotent for min/max reductions.
    LLVM_DEBUG(dbgs() << /*...*/ << ". (HorRdx)\n");
    return VectorizedValue;
  case RecurKind::Xor: {
    // Lanes whose value repeats an even number of times cancel to zero; build
    // a shuffle mask that replaces those lanes with zeros.
    SmallVector<int> Mask(
        cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
        /*...*/);
    std::iota(Mask.begin(), Mask.end(), 0);
    bool NeedShuffle = false;
    for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
      // ...
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
      if (Cnt % 2 == 0) {
        // ...
        NeedShuffle = true;
      }
    }
    LLVM_DEBUG(/* ... */ dbgs() << "> of " << VectorizedValue
                                << ". (HorRdx)\n");
    // ... (shuffle VectorizedValue against
    //      ConstantVector::getNullValue(VectorizedValue->getType()) with Mask)
    return VectorizedValue;
  }
  case RecurKind::FAdd: {
    // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
    for (Value *V : VL) {
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
      Vals.push_back(ConstantFP::get(V->getType(), Cnt));
    }
    // ...
    return Builder.CreateFMul(VectorizedValue, Scale);
  }
  case RecurKind::Mul:
  case RecurKind::FMul:
  case RecurKind::FMulAdd:
  case RecurKind::IAnyOf:
  case RecurKind::FAnyOf:
  case RecurKind::None:
    // ... (these kinds are not expected for reused scalars)
  }
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}

static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();

  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  // Walk down through homogeneous structs/arrays, multiplying the element
  // counts, until a scalar or fixed-vector leaf is reached.
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0))
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}
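// --- Illustrative sketch (not part of SLPVectorizer.cpp) ---------------------
// getAggregateSize() above flattens a homogeneous aggregate into a single
// element count by multiplying the sizes of each nesting level. The same
// arithmetic over a hypothetical Level description, standalone:
#include <cassert>
#include <optional>
#include <vector>

struct Level {
  unsigned Count;    // Number of elements at this nesting level.
  bool Homogeneous;  // Mixed-type struct levels are rejected.
};

std::optional<unsigned> flattenedSize(const std::vector<Level> &Nesting) {
  unsigned Size = 1;
  for (const Level &L : Nesting) {
    if (!L.Homogeneous)
      return std::nullopt;  // Mirrors the std::nullopt bail-out above.
    Size *= L.Count;
  }
  return Size;
}

int main() {
  // A 2-element array of 4-element structs of <8 x float> flattens to
  // 2 * 4 * 8 = 64 lanes.
  assert(flattenedSize({{2, true}, {4, true}, {8, true}}) == 64u);
  assert(!flattenedSize({{2, true}, {3, false}}).has_value());
  return 0;
}
// -----------------------------------------------------------------------------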
static void findBuildAggregate_rec(Instruction *LastInsertInst,
                                   TargetTransformInfo *TTI,
                                   SmallVectorImpl<Value *> &BuildVectorOpds,
                                   SmallVectorImpl<Value *> &InsertElts,
                                   unsigned OperandOffset) {
  do {
    // ...
    std::optional<unsigned> OperandIndex =
        /* ... */;
    // ...
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregate_rec(/* ... */, BuildVectorOpds, InsertElts,
                             *OperandIndex);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           /* ... */);
}

static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
  assert(BuildVectorOpds.empty() && InsertElts.empty() &&
         "Expected empty result vectors!");

  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);
  // ...
  if (BuildVectorOpds.size() >= 2)
    return true;

  return false;
}
static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {
  // A candidate reduction value must be an instruction that dominates the phi.
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }
  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check the incoming value that comes from the loop latch block.
  // ...
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }
  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}

static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
                                                 Instruction *Root) {
  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
          isa<IntrinsicInst>(Root)) &&
         "Expected binop, select, or intrinsic for reduction matching");
  Value *LHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
  Value *RHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  if (LHS == Phi)
    return dyn_cast<Instruction>(RHS);
  if (RHS == Phi)
    return dyn_cast<Instruction>(LHS);
  return nullptr;
}

// Returns the first operand of \p I that does not match \p Phi.
static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
  // ... (match the reduction binop to extract its operands)
  return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
}

// (fragment of the reduction-candidate check)
  Value *B0 = nullptr, *B1 = nullptr;
  // ...
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    TargetTransformInfo *TTI, SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  // ...
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);

  if (Root->getParent() != BB || isa<PHINode>(Root))
    return false;

  // If a secondary reduction root can be found, start from it instead.
  auto SelectRoot = [&]() {
    // ...
  };

  // Start the analysis from the Root instruction. If a horizontal reduction is
  // found, try to vectorize it; otherwise add the operands of the instruction
  // to the worklist (bounded breadth-first search).
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  // ...
  auto TryToReduce = [&](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    // ...
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      // ...
    }
    // Only certain instruction kinds are postponed for later vectorization.
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
      return false;
    // ...
    return true;
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // ...
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      // ...
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        // Try to find another reduction rooted at the resulting value.
        Stack.emplace(I, Level);
        continue;
      }
    } else {
      // Could not vectorize Inst; try to remember it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        // ...
      }
    }

    // Add the operands to the worklist for the next level of the search.
    for (auto *Op : Inst->operand_values())
      if (VisitedInstrs.insert(Op).second)
        if (auto *I = dyn_cast<Instruction>(Op))
          // Do not try to vectorize CmpInst operands, this is done separately.
          if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
              !R.isDeleted(I) && I->getParent() == BB)
            Stack.emplace(I, Level);
  }
  // ...
}

bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 TargetTransformInfo *TTI) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
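// --- Illustrative sketch (not part of SLPVectorizer.cpp) ---------------------
// vectorizeHorReduction() above walks the operands of a candidate root in
// breadth-first order, trying to match a reduction at each node and otherwise
// descending into the operands up to a depth limit. A minimal standalone
// version of that traversal shape over a hypothetical Node graph:
#include <functional>
#include <queue>
#include <unordered_set>
#include <utility>
#include <vector>

struct Node {
  std::vector<Node *> Operands;
};

// Visit nodes breadth-first starting at Root; TryMatch(N) returns true when a
// pattern was matched (that node's operands are then not explored further).
void walkOperandsBFS(Node *Root, unsigned MaxLevel,
                     const std::function<bool(Node *)> &TryMatch) {
  std::queue<std::pair<Node *, unsigned>> Stack;  // Same naming as above.
  std::unordered_set<Node *> Visited;
  Stack.emplace(Root, 0u);
  Visited.insert(Root);
  while (!Stack.empty()) {
    auto [N, Level] = Stack.front();
    Stack.pop();
    if (TryMatch(N))
      continue;                 // Matched: do not descend further.
    if (Level + 1 > MaxLevel)
      continue;                 // Depth limit on the search.
    for (Node *Op : N->Operands)
      if (Visited.insert(Op).second)
        Stack.emplace(Op, Level + 1);
  }
}
// -----------------------------------------------------------------------------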
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R) {
  if (!R.canMapToVector(IVI->getType()))
    return false;
  // ... (collect the build-vector operands via findBuildAggregate)
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  return tryToVectorizeList(BuildVectorOpds, R);
}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
      (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask)))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R);
}
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(T *, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  // ...

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
    // Look for the next elements with the same type, parent and operand kinds.
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
      ++SameTypeIt;

    // Try to vectorize them.
    unsigned NumElts = (SameTypeIt - IncIt);
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    if (NumElts > 1 &&
        TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
      Changed = true;
      // ...
    } else {
      // If too few compatible elements were found, remember them as
      // candidates and retry once enough values of the same type accumulate.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        Candidates.append(IncIt, std::next(IncIt, NumElts));
      }
    }
    // Final attempt to vectorize the accumulated candidates before the element
    // type changes.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using smaller vectors.
        for (auto *It = Candidates.begin(), *End = Candidates.end();
             It != End;) {
          auto *SameTypeIt = It;
          while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
            ++SameTypeIt;
          unsigned NumElts = (SameTypeIt - It);
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
                                                  /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next element of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
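// --- Illustrative sketch (not part of SLPVectorizer.cpp) ---------------------
// tryToVectorizeSequence() scans a sorted list and carves it into maximal runs
// of mutually "compatible" elements, handing each run to a callback. The same
// grouping shape, standalone and with hypothetical names:
#include <cstddef>
#include <functional>
#include <vector>

template <typename T>
void forEachCompatibleRun(
    const std::vector<T> &Sorted,
    const std::function<bool(const T &, const T &)> &AreCompatible,
    const std::function<void(const T *, std::size_t)> &Visit) {
  std::size_t I = 0;
  while (I < Sorted.size()) {
    std::size_t J = I;
    // Extend the run while elements stay compatible with the run's first one.
    while (J < Sorted.size() && AreCompatible(Sorted[J], Sorted[I]))
      ++J;
    Visit(&Sorted[I], J - I);  // One callback per run, like TryToVectorizeHelper.
    I = J;                     // Continue at the first incompatible element.
  }
}

// Example: grouping equal integers in a sorted vector
//   forEachCompatibleRun<int>({1, 1, 2, 3, 3, 3},
//                             [](int A, int B) { return A == B; },
//                             [](const int *Run, std::size_t N) { /* use Run[0..N) */ });
// visits the runs {1,1}, {2}, {3,3,3}.
// -----------------------------------------------------------------------------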
/// Compare two cmp instructions. If IsCompatibility is false the function
/// provides a strict weak ordering (for sorting); if it is true it answers
/// whether the two compares may be grouped together.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  // ...
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    // ...
    if (/* ... */)
      return !IsCompatibility;
    // ...
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Order instructions from different blocks by their dominator-tree
          // DFS numbers.
          // ...
          if (!NodeI1)
            return NodeI2 != nullptr;
          // ...
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
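// --- Illustrative sketch (not part of SLPVectorizer.cpp) ---------------------
// compareCmp<IsCompatibility>() serves two roles from one body: with
// IsCompatibility = false it behaves like a "less-than" used for sorting, and
// with IsCompatibility = true it answers "can these two be grouped?". A toy
// standalone version of that pattern over pairs of integers:
#include <cassert>
#include <utility>

template <bool IsCompatibility>
bool comparePairs(const std::pair<int, int> &A, const std::pair<int, int> &B) {
  if (A.first < B.first)
    return !IsCompatibility;  // Strictly ordered: "less", but not compatible.
  if (A.first > B.first)
    return false;             // Neither "less" nor compatible.
  // Equal keys: compatible; for ordering, fall back to the second field.
  if (IsCompatibility)
    return true;
  return A.second < B.second;
}

int main() {
  assert((comparePairs<false>({1, 9}, {2, 0})));  // Sorting: {1,*} < {2,*}.
  assert((comparePairs<true>({3, 1}, {3, 7})));   // Grouping: same key.
  assert((!comparePairs<true>({3, 1}, {4, 7})));  // Different key: not groupable.
  return 0;
}
// -----------------------------------------------------------------------------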
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // First, try to vectorize reductions rooted at the compare operands.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op))
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
  }
  // Then, try to vectorize the compares themselves.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Finally, try to vectorize lists of compatible compares.
  auto CompareSorter = [&](Value *V, Value *V2) {
    return compareCmp<false>(V, V2, *TLI, *DT);
  };
  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    return compareCmp<true>(V1, V2, *TLI, *DT);
  };

  // ... (collect the remaining, not-deleted compares into Vals)
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  // First pass: try to match horizontal reductions feeding the inserts.
  for (auto *I : reverse(Instructions)) {
    if (R.isDeleted(I))
      continue;
    // ...
    OpsChanged |=
        vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
    // ...
  }
  // Second pass: try to turn the insert chains themselves into build-vectors.
  for (auto *I : reverse(Instructions)) {
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
    }
  }
  // Now try to vectorize the postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);
  return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // ...
  // Sort the incoming phi values so that phis with the same shape of non-phi
  // operands end up adjacent and can be grouped for vectorization.
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    // ...
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Instructions come first.
      auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
      auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
      if (I1 && I2) {
        // ...
        if (!NodeI1)
          return NodeI2 != nullptr;
        // ...
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        if (S.getOpcode() && !S.isAltShuffle())
          continue;
        return I1->getOpcode() < I2->getOpcode();
      }
      // ...
      // Non-undef constants come next.
      bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
      bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
      // ...
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      // ...
      // Non-undef values come next, ordered by their ValueID.
      auto ValID1 = Opcodes1[I]->getValueID();
      auto ValID2 = Opcodes2[I]->getValueID();
      if (ValID1 == ValID2)
        continue;
      if (ValID1 < ValID2)
        return true;
      if (ValID1 > ValID2)
        return false;
      // ...
      // Undefs come last.
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    // ...
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
    // ...
    if (V1->getType() != V2->getType())
      return false;
    // ...
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (I1->getParent() != I2->getParent())
            return false;
          // ...
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming phi values and, for each phi, the non-phi values
    // reachable through its incoming phis (PHIToOpcodes).
    // ...
    if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
        /* ... */)
      // ...
    // ...
    if (!Opcodes.empty())
      continue;
    // ...
    while (!Nodes.empty()) {
      auto *PHI = cast<PHINode>(Nodes.pop_back_val());
      // ...
      for (Value *V : PHI->incoming_values()) {
        if (auto *PHI1 = dyn_cast<PHINode>((V))) {
          Nodes.push_back(PHI1);
          continue;
        }
        // ...
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    // ...
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();
  InstSetVector PostProcessInserts;
  // ...
  // Vectorizes Inserts in PostProcessInserts and, if VectorizeCmps is true,
  // also vectorizes the PostProcessCmps list.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if I is in PostProcessInserts or PostProcessCmps.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if the instruction's result is unused and the instruction
  // itself produces nothing further to vectorize.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable vector type.
    if (isa<ScalableVectorType>(It->getType()))
      continue;

    // Skip instructions marked for deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times, so skip the ones already checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // Some instructions were deleted; restart, because the iterator may
        // have become invalid.
        Changed = true;
        // ...
      }
      continue;
    }

    if (isa<DbgInfoIntrinsic>(It))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // ...
        if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
          Changed = true;
          // ...
        }
      }
      // Try to vectorize the incoming values of the phi, to catch reductions
      // that feed into PHIs.
      for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI))
          Changed |= vectorizeRootInstruction(nullptr, PI,
                                              P->getIncomingBlock(I), R, TTI);
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      // ...
      TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                            SI->getValueOperand()->hasOneUse();
      // ...
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay their
          // vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
        }
      }
      // ...
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      // ...
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there is nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size.
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      // ... (initialize the candidate set from this chunk of the GEP list)

      // Some of the candidates may have already been vectorized after we
      // initially collected them, or their index may have been optimized to a
      // constant. If so, remove them from the set of candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences: one address can be cheaply computed from the
      // other, so vectorizing the index math buys little.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        auto *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          auto *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      // ...
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try to vectorize the collected indices.
      // ...
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
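// --- Illustrative sketch (not part of SLPVectorizer.cpp) ---------------------
// The filtering loop above drops pairs of GEPs whose addresses differ by a
// SCEV-provable constant, since one is then trivially derived from the other.
// The same pairwise-filter shape on hypothetical symbolic addresses (Addr and
// hasConstantDifference are stand-ins, not LLVM APIs), standalone:
#include <cstddef>
#include <cstdint>
#include <set>
#include <vector>

struct Addr {                  // Hypothetical symbolic address: base + offset.
  int BaseId;
  std::int64_t Offset;
};

// Stand-in for "SE->getMinusSCEV(A, B) is a SCEVConstant": the difference is a
// provable constant only when both addresses share the same symbolic base.
bool hasConstantDifference(const Addr &A, const Addr &B) {
  return A.BaseId == B.BaseId;
}

// Returns the indices of addresses that survive the pairwise filter.
std::set<std::size_t> filterDerivedAddresses(const std::vector<Addr> &Addrs) {
  std::set<std::size_t> Candidates;
  for (std::size_t I = 0; I < Addrs.size(); ++I)
    Candidates.insert(I);
  for (std::size_t I = 0; I < Addrs.size() && Candidates.size() > 1; ++I) {
    if (!Candidates.count(I))
      continue;
    for (std::size_t J = I + 1; J < Addrs.size() && Candidates.size() > 1; ++J) {
      if (hasConstantDifference(Addrs[I], Addrs[J])) {
        // Both ends of a constant-offset pair are dropped, mirroring
        // Candidates.remove(GEPI) / Candidates.remove(GEPJ) above.
        Candidates.erase(I);
        Candidates.erase(J);
      }
    }
  }
  return Candidates;
}
// Example: {{0,0}, {0,8}, {1,0}} -> only index 2 survives, because the first
// two addresses differ by the constant 8.
// -----------------------------------------------------------------------------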
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointer and value operand. Value operands must be
  // compatible (same opcode, same parent), otherwise it is definitely not
  // profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    // UndefValues are compatible with all other values.
    if (isa<UndefValue>(V->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        // Order value operands from different blocks by dominator-tree DFS
        // number.
        /* ... NodeI1 = */ DT->getNode(I1->getParent());
        /* ... NodeI2 = */ DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        // ...
        return I1->getOpcode() < I2->getOpcode();
      }
    if (isa<Constant>(V->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return false;
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    // ...
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        return S.getOpcode() > 0;
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store groups.
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << /*...*/ << Pair.second.size() << ".\n");

    // ...
    // Reverse the stores so the analysis runs bottom-to-top; this matters when
    // the same address is stored to several times, because the order of the
    // last store matters.
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&R, this](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Loop::LoopBounds::Direction Direction
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static Value * isOneOf(const InstructionsState &S, Value *Op)
Chooses the correct key for scheduling data.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static std::string shortBundleName(ArrayRef< Value * > VL)
Print a short descriptor of the instruction bundle suitable for debug output.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static std::optional< unsigned > getInsertIndex(const Value *InsertInst, unsigned Offset=0)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
This defines the Use class.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleCostEstimator(TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
value_type & FindAndConstruct(const KeyT &Key)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
const BasicBlock * getParent() const
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
size_type count(const KeyT &Key) const
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
void reserve(size_type NumEntries)
Grow the MapVector so that it can contain at least NumEntries items before resizing again.
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T get() const
Returns the value of the specified pointer type.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
void clear()
Completely clear the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
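insert() on a SmallPtrSet doubles as a "have we seen this yet?" check, which is the common visited-set idiom. A minimal sketch (markVisited is an illustrative name):

  #include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/IR/Value.h"

  static bool markVisited(llvm::SmallPtrSetImpl<llvm::Value *> &Visited,
                          llvm::Value *V) {
    // insert() returns {iterator, bool}; the bool is true only for new elements.
    return Visited.insert(V).second;
  }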
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
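A minimal sketch of the SmallVector operations listed above, reserving capacity before a known-size append (makeSquares is a made-up name):

  #include "llvm/ADT/SmallVector.h"

  static llvm::SmallVector<int, 4> makeSquares(unsigned N) {
    llvm::SmallVector<int, 4> Out;
    Out.reserve(N);               // one heap allocation if N exceeds the inline capacity
    for (unsigned I = 0; I < N; ++I)
      Out.push_back(int(I * I));
    return Out;
  }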
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
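The Type predicates above are typically combined to filter scalar element types. A minimal sketch under that assumption (isPlainScalarTy is a made-up name, not the predicate used by the pass):

  #include "llvm/IR/Type.h"

  static bool isPlainScalarTy(llvm::Type *Ty) {
    llvm::Type *ScalarTy = Ty->getScalarType();  // element type for vectors, Ty otherwise
    return (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy()) &&
           !ScalarTy->isX86_FP80Ty() && !ScalarTy->isPPC_FP128Ty();
  }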
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Value * getOperand(unsigned i) const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
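hasOneUse/replaceAllUsesWith are the standard way to inspect and rewrite a value's users. A minimal sketch (replaceIfSingleUse is illustrative only):

  #include "llvm/IR/Value.h"

  static void replaceIfSingleUse(llvm::Value *Old, llvm::Value *New) {
    if (Old->hasOneUse() && Old->getType() == New->getType())
      Old->replaceAllUsesWith(New);   // redirect the single user to New
  }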
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
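VectorType::get takes an ElementCount, so the same call covers fixed and scalable vectors. A minimal fixed-width sketch (makeVec4Ty is a made-up helper name):

  #include "llvm/IR/DerivedTypes.h"

  static llvm::VectorType *makeVec4Ty(llvm::Type *ScalarTy) {
    // <4 x ScalarTy>; use ElementCount::getScalable(4) for a scalable vector.
    return llvm::VectorType::get(ScalarTy, llvm::ElementCount::getFixed(4));
  }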
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state in which the loads of the given sequence can be represented.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
unsigned getTreeSize() const
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isLoadCombineCandidate() const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
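The m_* combinators above compose into tree patterns and bind sub-values as they match. A minimal sketch that recognizes (X << C) | Y using only the matchers listed here (matchShlOr is an illustrative name, not something the pass defines):

  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/PatternMatch.h"
  #include "llvm/IR/Value.h"

  static bool matchShlOr(llvm::Value *V, llvm::Value *&X, llvm::Value *&Y) {
    using namespace llvm::PatternMatch;
    const llvm::APInt *C;
    // Binds X and Y on success; C captures the constant shift amount.
    return match(V, m_Or(m_Shl(m_Value(X), m_APInt(C)), m_Value(Y)));
  }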
@ Undef
Value of the register doesn't matter.
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
testing::Matcher< const detail::ErrorHolder & > Failed()
bool getAlign(const Function &F, unsigned index, unsigned &align)
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
iterator_range< po_iterator< T > > post_order(const T &G)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
auto reverse(ContainerTy &&C)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
constexpr int PoisonMaskElem
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
DWARFExpression::Operation Op
auto max_element(R &&Range)
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
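The range wrappers above (all_of, any_of, count_if, find_if, is_contained, enumerate, ...) replace explicit begin/end iterator pairs. A minimal sketch (hasOnlyPositive and firstNegativeIndex are made-up names):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"

  static bool hasOnlyPositive(llvm::ArrayRef<int> Vals) {
    return llvm::all_of(Vals, [](int V) { return V > 0; });
  }

  static int firstNegativeIndex(llvm::ArrayRef<int> Vals) {
    for (const auto &En : llvm::enumerate(Vals))
      if (En.value() < 0)
        return int(En.index());   // enumerate pairs each element with its index
    return -1;
  }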
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is congruent to Skew modulo Align.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const