73#ifdef EXPENSIVE_CHECKS
106using namespace slpvectorizer;
108#define SV_NAME "slp-vectorizer"
109#define DEBUG_TYPE "SLP"
111STATISTIC(NumVectorInstructions,
"Number of vector instructions generated");
114 "Controls which SLP graphs should be vectorized.");
118 cl::desc(
"Run the SLP vectorization passes"));
122 cl::desc(
"Enable vectorization for wider vector utilization"));
126 cl::desc(
"Only vectorize if you gain more than this "
131 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
132 "heuristics and makes vectorization decision via cost modeling."));
136 cl::desc(
"Attempt to vectorize horizontal reductions"));
141 "Attempt to vectorize horizontal reductions feeding into a store"));
145 cl::desc(
"Attempt to vectorize for this register size in bits"));
149 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
157 cl::desc(
"Limit the size of the SLP scheduling region per block"));
161 cl::desc(
"Attempt to vectorize for this register size in bits"));
165 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
169 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
175 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
184 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
188 cl::desc(
"The minimum number of loads, which should be considered strided, "
189 "if the stride is > 1 or is runtime value"));
193 cl::desc(
"The maximum stride, considered to be profitable."));
197 cl::desc(
"Display the SLP trees with Graphviz"));
201 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
232 if (
SLPReVec && isa<FixedVectorType>(Ty))
234 return VectorType::isValidElementType(Ty) && !Ty->
isX86_FP80Ty() &&
243 if (
auto *SI = dyn_cast<StoreInst>(V))
244 return SI->getValueOperand()->getType();
245 if (
auto *CI = dyn_cast<CmpInst>(V))
246 return CI->getOperand(0)->getType();
247 if (
auto *IE = dyn_cast<InsertElementInst>(V))
248 return IE->getOperand(1)->getType();
254 assert(!isa<ScalableVectorType>(Ty) &&
255 "ScalableVectorType is not supported.");
256 if (
auto *VecTy = dyn_cast<FixedVectorType>(Ty))
257 return VecTy->getNumElements();
271 Type *Ty,
unsigned Sz) {
276 if (NumParts == 0 || NumParts >= Sz)
291 if (NumParts == 0 || NumParts >= Sz)
296 return (Sz / RegVF) * RegVF;
306 for (
unsigned I : seq<unsigned>(Mask.size()))
308 I * VecTyNumElements, VecTyNumElements)))
310 : Mask[
I] * VecTyNumElements + J;
341 if (!
all_of(VL, IsaPred<ShuffleVectorInst>))
343 auto *SV = cast<ShuffleVectorInst>(VL.
front());
344 unsigned SVNumElements =
345 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
346 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
347 if (SVNumElements % ShuffleMaskSize != 0)
349 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
350 if (GroupSize == 0 || (VL.
size() % GroupSize) != 0)
352 unsigned NumGroup = 0;
353 for (
size_t I = 0, E = VL.
size();
I != E;
I += GroupSize) {
354 auto *SV = cast<ShuffleVectorInst>(VL[
I]);
355 Value *Src = SV->getOperand(0);
359 auto *SV = cast<ShuffleVectorInst>(V);
361 if (SV->getOperand(0) != Src)
364 if (!SV->isExtractSubvectorMask(Index))
366 ExpectedIndex.
set(Index / ShuffleMaskSize);
370 if (!ExpectedIndex.
all())
374 assert(NumGroup == (VL.
size() / GroupSize) &&
"Unexpected number of groups");
392 auto *SV = cast<ShuffleVectorInst>(VL.
front());
393 unsigned SVNumElements =
394 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
396 unsigned AccumulateLength = 0;
397 for (
Value *V : VL) {
398 auto *SV = cast<ShuffleVectorInst>(V);
399 for (
int M : SV->getShuffleMask())
401 : AccumulateLength + M);
402 AccumulateLength += SVNumElements;
410 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
417 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
418 !isa<ExtractValueInst, UndefValue>(V))
420 auto *
I = dyn_cast<Instruction>(V);
421 if (!
I || isa<ExtractValueInst>(
I))
423 if (!isa<FixedVectorType>(
I->getOperand(0)->getType()))
425 if (isa<ExtractElementInst>(
I))
427 assert(isa<InsertElementInst>(V) &&
"Expected only insertelement.");
443 return std::min<unsigned>(PartNumElems,
Size - Part * PartNumElems);
452 OS <<
"Idx: " <<
Idx <<
", ";
453 OS <<
"n=" << VL.
size() <<
" [" << *VL.
front() <<
", ..]";
461 auto *It =
find_if(VL, IsaPred<Instruction>);
470 if (isa<PoisonValue>(V))
472 auto *
II = dyn_cast<Instruction>(V);
476 if (BB !=
II->getParent())
493 Value *FirstNonUndef =
nullptr;
494 for (
Value *V : VL) {
495 if (isa<UndefValue>(V))
497 if (!FirstNonUndef) {
501 if (V != FirstNonUndef)
504 return FirstNonUndef !=
nullptr;
509 if (
auto *Cmp = dyn_cast<CmpInst>(
I))
510 return Cmp->isCommutative();
511 if (
auto *BO = dyn_cast<BinaryOperator>(
I))
512 return BO->isCommutative() ||
513 (BO->getOpcode() == Instruction::Sub &&
520 if (match(U.getUser(),
521 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
522 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
526 return match(U.getUser(),
527 m_Intrinsic<Intrinsic::abs>(
528 m_Specific(U.get()), m_ConstantInt(Flag))) &&
529 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
532 (BO->getOpcode() == Instruction::FSub &&
535 return match(U.getUser(),
536 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
538 return I->isCommutative();
544 static_assert(std::is_same_v<T, InsertElementInst> ||
545 std::is_same_v<T, ExtractElementInst>,
548 if (
const auto *IE = dyn_cast<T>(Inst)) {
549 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
552 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
555 if (CI->getValue().uge(VT->getNumElements()))
557 Index *= VT->getNumElements();
558 Index += CI->getZExtValue();
569 if (
auto Index = getInsertExtractIndex<InsertElementInst>(Inst,
Offset))
571 if (
auto Index = getInsertExtractIndex<ExtractElementInst>(Inst,
Offset))
576 const auto *
IV = dyn_cast<InsertValueInst>(Inst);
580 Type *CurrentType =
IV->getType();
581 for (
unsigned I :
IV->indices()) {
582 if (
const auto *ST = dyn_cast<StructType>(CurrentType)) {
583 Index *= ST->getNumElements();
584 CurrentType = ST->getElementType(
I);
585 }
else if (
const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
586 Index *= AT->getNumElements();
587 CurrentType = AT->getElementType();
620 if (MaskArg == UseMask::UndefsAsMask)
624 if (MaskArg == UseMask::FirstArg &&
Value < VF)
625 UseMask.reset(
Value);
626 else if (MaskArg == UseMask::SecondArg &&
Value >= VF)
627 UseMask.reset(
Value - VF);
635template <
bool IsPoisonOnly = false>
639 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
642 auto *VecTy = dyn_cast<FixedVectorType>(
V->getType());
645 auto *
C = dyn_cast<Constant>(V);
647 if (!UseMask.empty()) {
649 while (
auto *
II = dyn_cast<InsertElementInst>(
Base)) {
651 if (isa<T>(
II->getOperand(1)))
658 if (*
Idx < UseMask.size() && !UseMask.test(*
Idx))
666 Res &= isUndefVector<IsPoisonOnly>(
Base, SubMask);
673 for (
unsigned I = 0, E = VecTy->getNumElements();
I != E; ++
I) {
674 if (
Constant *Elem =
C->getAggregateElement(
I))
676 (UseMask.empty() || (
I < UseMask.size() && !UseMask.test(
I))))
704static std::optional<TargetTransformInfo::ShuffleKind>
707 const auto *It =
find_if(VL, IsaPred<ExtractElementInst>);
711 std::accumulate(VL.
begin(), VL.
end(), 0u, [](
unsigned S,
Value *V) {
712 auto *EI = dyn_cast<ExtractElementInst>(V);
715 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
718 return std::max(S, VTy->getNumElements());
721 Value *Vec1 =
nullptr;
722 Value *Vec2 =
nullptr;
724 auto *EE = dyn_cast<ExtractElementInst>(V);
727 Value *Vec = EE->getVectorOperand();
728 if (isa<UndefValue>(Vec))
733 ShuffleMode CommonShuffleMode =
Unknown;
735 for (
unsigned I = 0, E = VL.
size();
I < E; ++
I) {
737 if (isa<UndefValue>(VL[
I]))
739 auto *EI = cast<ExtractElementInst>(VL[
I]);
740 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
742 auto *Vec = EI->getVectorOperand();
744 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
747 if (isa<UndefValue>(Vec)) {
750 if (isa<UndefValue>(EI->getIndexOperand()))
752 auto *
Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
758 unsigned IntIdx =
Idx->getValue().getZExtValue();
765 if (!Vec1 || Vec1 == Vec) {
767 }
else if (!Vec2 || Vec2 == Vec) {
773 if (CommonShuffleMode == Permute)
777 if (Mask[
I] %
Size !=
I) {
778 CommonShuffleMode = Permute;
781 CommonShuffleMode =
Select;
784 if (CommonShuffleMode ==
Select && Vec2)
795 assert((Opcode == Instruction::ExtractElement ||
796 Opcode == Instruction::ExtractValue) &&
797 "Expected extractelement or extractvalue instruction.");
798 if (Opcode == Instruction::ExtractElement) {
799 auto *CI = dyn_cast<ConstantInt>(E->
getOperand(1));
802 return CI->getZExtValue();
804 auto *EI = cast<ExtractValueInst>(E);
805 if (EI->getNumIndices() != 1)
807 return *EI->idx_begin();
813class InstructionsState {
828 unsigned getAltOpcode()
const {
833 bool isAltShuffle()
const {
return AltOp != MainOp; }
836 unsigned CheckedOpcode =
I->getOpcode();
837 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
840 InstructionsState() =
delete;
842 : MainOp(MainOp), AltOp(AltOp) {}
843 static InstructionsState invalid() {
return {
nullptr,
nullptr}; }
869 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
870 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
871 BaseOp0 == Op0 || BaseOp1 == Op1 ||
882 "Assessing comparisons of different types?");
892 return (BasePred == Pred &&
894 (BasePred == SwappedPred &&
904 if (!
all_of(VL, IsaPred<Instruction, PoisonValue>))
905 return InstructionsState::invalid();
907 auto *It =
find_if(VL, IsaPred<Instruction>);
909 return InstructionsState::invalid();
912 unsigned InstCnt = std::count_if(It, VL.
end(), IsaPred<Instruction>);
913 if ((VL.
size() > 2 && !isa<PHINode>(V) && InstCnt < VL.
size() / 2) ||
914 (VL.
size() == 2 && InstCnt < 2))
915 return InstructionsState::invalid();
917 bool IsCastOp = isa<CastInst>(V);
918 bool IsBinOp = isa<BinaryOperator>(V);
919 bool IsCmpOp = isa<CmpInst>(V);
922 unsigned Opcode = cast<Instruction>(V)->getOpcode();
923 unsigned AltOpcode = Opcode;
924 unsigned AltIndex = std::distance(VL.
begin(), It);
926 bool SwappedPredsCompatible = [&]() {
930 UniquePreds.
insert(BasePred);
931 UniqueNonSwappedPreds.
insert(BasePred);
932 for (
Value *V : VL) {
933 auto *
I = dyn_cast<CmpInst>(V);
939 UniqueNonSwappedPreds.
insert(CurrentPred);
940 if (!UniquePreds.
contains(CurrentPred) &&
941 !UniquePreds.
contains(SwappedCurrentPred))
942 UniquePreds.
insert(CurrentPred);
947 return UniqueNonSwappedPreds.
size() > 2 && UniquePreds.
size() == 2;
951 auto *IBase = cast<Instruction>(V);
954 if (
auto *
CallBase = dyn_cast<CallInst>(IBase)) {
958 return InstructionsState::invalid();
960 bool AnyPoison = InstCnt != VL.
size();
961 for (
int Cnt = 0, E = VL.
size(); Cnt < E; Cnt++) {
962 auto *
I = dyn_cast<Instruction>(VL[Cnt]);
969 if (AnyPoison && (
I->isIntDivRem() ||
I->isFPDivRem() || isa<CallInst>(
I)))
970 return InstructionsState::invalid();
971 unsigned InstOpcode =
I->getOpcode();
972 if (IsBinOp && isa<BinaryOperator>(
I)) {
973 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
977 AltOpcode = InstOpcode;
981 }
else if (IsCastOp && isa<CastInst>(
I)) {
982 Value *Op0 = IBase->getOperand(0);
984 Value *Op1 =
I->getOperand(0);
987 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
989 if (Opcode == AltOpcode) {
992 "Cast isn't safe for alternation, logic needs to be updated!");
993 AltOpcode = InstOpcode;
998 }
else if (
auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
999 auto *BaseInst = cast<CmpInst>(V);
1000 Type *Ty0 = BaseInst->getOperand(0)->getType();
1001 Type *Ty1 = Inst->getOperand(0)->getType();
1003 assert(InstOpcode == Opcode &&
"Expected same CmpInst opcode.");
1004 assert(InstOpcode == AltOpcode &&
1005 "Alternate instructions are only supported by BinaryOperator "
1013 if ((E == 2 || SwappedPredsCompatible) &&
1014 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1019 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
1023 }
else if (BasePred != CurrentPred) {
1026 "CmpInst isn't safe for alternation, logic needs to be updated!");
1031 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1032 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1035 }
else if (InstOpcode == Opcode) {
1036 assert(InstOpcode == AltOpcode &&
1037 "Alternate instructions are only supported by BinaryOperator and "
1039 if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
1040 if (Gep->getNumOperands() != 2 ||
1041 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
1042 return InstructionsState::invalid();
1043 }
else if (
auto *EI = dyn_cast<ExtractElementInst>(
I)) {
1045 return InstructionsState::invalid();
1046 }
else if (
auto *LI = dyn_cast<LoadInst>(
I)) {
1047 auto *BaseLI = cast<LoadInst>(IBase);
1048 if (!LI->isSimple() || !BaseLI->isSimple())
1049 return InstructionsState::invalid();
1050 }
else if (
auto *Call = dyn_cast<CallInst>(
I)) {
1051 auto *
CallBase = cast<CallInst>(IBase);
1053 return InstructionsState::invalid();
1054 if (Call->hasOperandBundles() &&
1056 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1057 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1060 return InstructionsState::invalid();
1063 return InstructionsState::invalid();
1066 if (Mappings.
size() != BaseMappings.
size() ||
1067 Mappings.
front().ISA != BaseMappings.
front().ISA ||
1068 Mappings.
front().ScalarName != BaseMappings.
front().ScalarName ||
1069 Mappings.
front().VectorName != BaseMappings.
front().VectorName ||
1070 Mappings.
front().Shape.VF != BaseMappings.
front().Shape.VF ||
1071 Mappings.
front().Shape.Parameters !=
1072 BaseMappings.
front().Shape.Parameters)
1073 return InstructionsState::invalid();
1078 return InstructionsState::invalid();
1081 return InstructionsState(cast<Instruction>(V),
1082 cast<Instruction>(VL[AltIndex]));
1099 unsigned Opcode = UserInst->
getOpcode();
1101 case Instruction::Load: {
1102 LoadInst *LI = cast<LoadInst>(UserInst);
1105 case Instruction::Store: {
1106 StoreInst *SI = cast<StoreInst>(UserInst);
1107 return (SI->getPointerOperand() == Scalar);
1109 case Instruction::Call: {
1110 CallInst *CI = cast<CallInst>(UserInst);
1113 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1114 Arg.value().get() == Scalar;
1126 if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
1133 if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
1134 return LI->isSimple();
1136 return SI->isSimple();
1138 return !
MI->isVolatile();
1146 bool ExtendingManyInputs =
false) {
1147 if (SubMask.
empty())
1150 (!ExtendingManyInputs || SubMask.
size() > Mask.size() ||
1153 "SubMask with many inputs support must be larger than the mask.");
1155 Mask.append(SubMask.
begin(), SubMask.
end());
1159 int TermValue = std::min(Mask.size(), SubMask.
size());
1160 for (
int I = 0, E = SubMask.
size();
I < E; ++
I) {
1162 (!ExtendingManyInputs &&
1163 (SubMask[
I] >= TermValue || Mask[SubMask[
I]] >= TermValue)))
1165 NewMask[
I] = Mask[SubMask[
I]];
1181 const unsigned Sz = Order.
size();
1184 for (
unsigned I = 0;
I < Sz; ++
I) {
1186 UnusedIndices.
reset(Order[
I]);
1188 MaskedIndices.
set(
I);
1190 if (MaskedIndices.
none())
1193 "Non-synced masked/available indices.");
1197 assert(
Idx >= 0 &&
"Indices must be synced.");
1208 Type *ScalarTy = VL[0]->getType();
1211 for (
unsigned Lane : seq<unsigned>(VL.
size())) {
1212 if (isa<PoisonValue>(VL[Lane]))
1214 if (cast<Instruction>(VL[Lane])->
getOpcode() == Opcode1)
1215 OpcodeMask.
set(Lane * ScalarTyNumElements,
1216 Lane * ScalarTyNumElements + ScalarTyNumElements);
1226 const unsigned E = Indices.
size();
1228 for (
unsigned I = 0;
I < E; ++
I)
1229 Mask[Indices[
I]] =
I;
1235 assert(!Mask.empty() &&
"Expected non-empty mask.");
1239 for (
unsigned I = 0, E = Prev.
size();
I < E; ++
I)
1241 Scalars[Mask[
I]] = Prev[
I];
1249 auto *
I = dyn_cast<Instruction>(V);
1254 auto *IO = dyn_cast<Instruction>(V);
1257 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1266 auto *
I = dyn_cast<Instruction>(V);
1270 return !
I->mayReadOrWriteMemory() && !
I->hasNUsesOrMore(
UsesLimit) &&
1272 auto *IU = dyn_cast<Instruction>(U);
1275 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1291 return !VL.
empty() &&
1307 return NumParts > 0 && NumParts < Sz &&
has_single_bit(Sz / NumParts) &&
1311namespace slpvectorizer {
1316 struct ScheduleData;
1340 : BatchAA(*Aa),
F(Func), SE(Se),
TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1341 AC(AC), DB(DB),
DL(
DL), ORE(ORE),
1392 return !VectorizableTree.
empty() &&
1393 !VectorizableTree.
front()->UserTreeIndices.empty();
1398 assert(!VectorizableTree.
empty() &&
"No graph to get the first node from");
1399 return VectorizableTree.
front()->Scalars;
1405 const TreeEntry &Root = *VectorizableTree.
front().get();
1406 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1407 !Root.Scalars.front()->getType()->isIntegerTy())
1408 return std::nullopt;
1409 auto It = MinBWs.
find(&Root);
1410 if (It != MinBWs.
end())
1414 if (Root.getOpcode() == Instruction::ZExt ||
1415 Root.getOpcode() == Instruction::SExt)
1416 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
1417 Root.getOpcode() == Instruction::SExt);
1418 return std::nullopt;
1424 return MinBWs.
at(VectorizableTree.
front().get()).second;
1429 if (ReductionBitWidth == 0 ||
1430 !VectorizableTree.
front()->Scalars.front()->getType()->isIntegerTy() ||
1431 ReductionBitWidth >=
1432 DL->getTypeSizeInBits(
1433 VectorizableTree.
front()->Scalars.front()->getType()))
1435 VectorizableTree.
front()->Scalars.front()->getType(),
1436 VectorizableTree.
front()->getVectorFactor());
1439 VectorizableTree.
front()->Scalars.front()->getContext(),
1441 VectorizableTree.
front()->getVectorFactor());
1456 VectorizableTree.
clear();
1457 ScalarToTreeEntry.clear();
1458 MultiNodeScalars.clear();
1460 NonScheduledFirst.
clear();
1461 EntryToLastInstruction.clear();
1462 LoadEntriesToVectorize.
clear();
1463 IsGraphTransformMode =
false;
1464 GatheredLoadsEntriesFirst.reset();
1465 ExternalUses.
clear();
1466 ExternalUsesAsOriginalScalar.clear();
1467 for (
auto &Iter : BlocksSchedules) {
1468 BlockScheduling *BS = Iter.second.get();
1472 ReductionBitWidth = 0;
1474 CastMaxMinBWSizes.reset();
1475 ExtraBitWidthNodes.
clear();
1476 InstrElementSize.clear();
1477 UserIgnoreList =
nullptr;
1478 PostponedGathers.
clear();
1479 ValueToGatherNodes.
clear();
1495 assert(!Order.
empty() &&
"expected non-empty order");
1496 const unsigned Sz = Order.
size();
1498 return P.value() ==
P.index() ||
P.value() == Sz;
1551 return MaxVecRegSize;
1556 return MinVecRegSize;
1564 unsigned MaxVF =
MaxVFOption.getNumOccurrences() ?
1566 return MaxVF ? MaxVF : UINT_MAX;
1618 unsigned *BestVF =
nullptr,
1619 bool TryRecursiveCheck =
true)
const;
1627 template <
typename T>
1654 OS <<
"{User:" << (
UserTE ? std::to_string(
UserTE->Idx) :
"null")
1655 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
1677 : TLI(TLI),
DL(
DL), SE(SE), R(R), NumLanes(NumLanes),
1678 MaxLevel(MaxLevel) {}
1732 if (isa<LoadInst>(V1)) {
1734 auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
1739 auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
1741 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1744 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1747 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1749 ((
int)V1->getNumUses() == NumLanes ||
1750 AllUsersAreInternal(V1, V2)))
1756 auto CheckSameEntryOrFail = [&]() {
1757 if (
const TreeEntry *TE1 = R.getTreeEntry(V1);
1758 TE1 && TE1 == R.getTreeEntry(V2))
1763 auto *LI1 = dyn_cast<LoadInst>(V1);
1764 auto *LI2 = dyn_cast<LoadInst>(V2);
1766 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1768 return CheckSameEntryOrFail();
1771 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1772 LI2->getPointerOperand(),
DL, SE,
true);
1773 if (!Dist || *Dist == 0) {
1776 R.TTI->isLegalMaskedGather(
1779 return CheckSameEntryOrFail();
1783 if (std::abs(*Dist) > NumLanes / 2)
1792 auto *C1 = dyn_cast<Constant>(V1);
1793 auto *C2 = dyn_cast<Constant>(V2);
1807 if (isa<UndefValue>(V2))
1811 Value *EV2 =
nullptr;
1824 int Dist = Idx2 - Idx1;
1827 if (std::abs(Dist) == 0)
1829 if (std::abs(Dist) > NumLanes / 2)
1836 return CheckSameEntryOrFail();
1839 auto *I1 = dyn_cast<Instruction>(V1);
1840 auto *I2 = dyn_cast<Instruction>(V2);
1842 if (I1->getParent() != I2->getParent())
1843 return CheckSameEntryOrFail();
1850 if (S.getOpcode() &&
1851 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.
empty() ||
1852 !S.isAltShuffle()) &&
1854 return isa<PoisonValue>(V) ||
1855 cast<Instruction>(V)->getNumOperands() ==
1856 S.getMainOp()->getNumOperands();
1862 if (I1 && isa<PoisonValue>(V2))
1865 if (isa<UndefValue>(V2))
1868 return CheckSameEntryOrFail();
1902 int ShallowScoreAtThisLevel =
1911 auto *I1 = dyn_cast<Instruction>(
LHS);
1912 auto *I2 = dyn_cast<Instruction>(
RHS);
1913 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1915 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1916 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1917 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1918 ShallowScoreAtThisLevel))
1919 return ShallowScoreAtThisLevel;
1920 assert(I1 && I2 &&
"Should have early exited.");
1927 for (
unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1928 OpIdx1 != NumOperands1; ++OpIdx1) {
1930 int MaxTmpScore = 0;
1931 unsigned MaxOpIdx2 = 0;
1932 bool FoundBest =
false;
1936 ? I2->getNumOperands()
1937 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1938 assert(FromIdx <= ToIdx &&
"Bad index");
1939 for (
unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1941 if (Op2Used.
count(OpIdx2))
1946 I1, I2, CurrLevel + 1, {});
1949 TmpScore > MaxTmpScore) {
1950 MaxTmpScore = TmpScore;
1957 Op2Used.
insert(MaxOpIdx2);
1958 ShallowScoreAtThisLevel += MaxTmpScore;
1961 return ShallowScoreAtThisLevel;
1992 struct OperandData {
1993 OperandData() =
default;
1994 OperandData(
Value *V,
bool APO,
bool IsUsed)
1995 : V(V), APO(APO), IsUsed(IsUsed) {}
2005 bool IsUsed =
false;
2014 enum class ReorderingMode {
2028 unsigned ArgSize = 0;
2034 const Loop *L =
nullptr;
2037 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
2038 return OpsVec[OpIdx][Lane];
2042 const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const {
2043 return OpsVec[OpIdx][Lane];
2048 for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
2049 OpIdx != NumOperands; ++OpIdx)
2050 for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2052 OpsVec[OpIdx][Lane].IsUsed =
false;
2056 void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
2057 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2069 int getSplatScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx,
2071 Value *IdxLaneV = getData(
Idx, Lane).V;
2072 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2073 isa<ExtractElementInst>(IdxLaneV))
2076 for (
unsigned Ln : seq<unsigned>(getNumLanes())) {
2079 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2080 if (!isa<Instruction>(OpIdxLnV))
2084 unsigned UniquesCount = Uniques.
size();
2085 auto IdxIt = Uniques.
find(IdxLaneV);
2086 unsigned UniquesCntWithIdxLaneV =
2087 IdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
2088 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2089 auto OpIdxIt = Uniques.
find(OpIdxLaneV);
2090 unsigned UniquesCntWithOpIdxLaneV =
2091 OpIdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
2092 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2094 return std::min(
bit_ceil(UniquesCntWithOpIdxLaneV) -
2095 UniquesCntWithOpIdxLaneV,
2096 UniquesCntWithOpIdxLaneV -
2098 ((IdxIt != Uniques.
end() && UsedLanes.
test(IdxIt->second))
2099 ? UniquesCntWithIdxLaneV -
bit_floor(UniquesCntWithIdxLaneV)
2100 :
bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2109 int getExternalUseScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
2110 Value *IdxLaneV = getData(
Idx, Lane).V;
2111 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2120 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2121 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2123 return R.areAllUsersVectorized(IdxLaneI)
2131 static const int ScoreScaleFactor = 10;
2139 int Lane,
unsigned OpIdx,
unsigned Idx,
2149 int SplatScore = getSplatScore(Lane, OpIdx,
Idx, UsedLanes);
2150 if (Score <= -SplatScore) {
2154 Score += SplatScore;
2160 Score *= ScoreScaleFactor;
2161 Score += getExternalUseScore(Lane, OpIdx,
Idx);
2179 std::optional<unsigned>
2180 getBestOperand(
unsigned OpIdx,
int Lane,
int LastLane,
2184 unsigned NumOperands = getNumOperands();
2187 Value *OpLastLane = getData(OpIdx, LastLane).V;
2190 ReorderingMode RMode = ReorderingModes[OpIdx];
2191 if (RMode == ReorderingMode::Failed)
2192 return std::nullopt;
2195 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2201 std::optional<unsigned>
Idx;
2205 BestScoresPerLanes.
try_emplace(std::make_pair(OpIdx, Lane), 0)
2211 bool IsUsed = RMode == ReorderingMode::Splat ||
2212 RMode == ReorderingMode::Constant ||
2213 RMode == ReorderingMode::Load;
2215 for (
unsigned Idx = 0;
Idx != NumOperands; ++
Idx) {
2217 OperandData &OpData = getData(
Idx, Lane);
2219 bool OpAPO = OpData.APO;
2228 if (OpAPO != OpIdxAPO)
2233 case ReorderingMode::Load:
2234 case ReorderingMode::Opcode: {
2235 bool LeftToRight = Lane > LastLane;
2236 Value *OpLeft = (LeftToRight) ? OpLastLane :
Op;
2237 Value *OpRight = (LeftToRight) ?
Op : OpLastLane;
2238 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2239 OpIdx,
Idx, IsUsed, UsedLanes);
2240 if (Score >
static_cast<int>(BestOp.Score) ||
2241 (Score > 0 && Score ==
static_cast<int>(BestOp.Score) &&
2244 BestOp.Score = Score;
2245 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2249 case ReorderingMode::Constant:
2250 if (isa<Constant>(
Op) ||
2251 (!BestOp.Score && L && L->isLoopInvariant(
Op))) {
2253 if (isa<Constant>(
Op)) {
2255 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2258 if (isa<UndefValue>(
Op) || !isa<Constant>(
Op))
2262 case ReorderingMode::Splat:
2263 if (
Op == OpLastLane || (!BestOp.Score && isa<Constant>(
Op))) {
2264 IsUsed =
Op == OpLastLane;
2265 if (
Op == OpLastLane) {
2267 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2273 case ReorderingMode::Failed:
2279 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2283 return std::nullopt;
2290 unsigned getBestLaneToStartReordering()
const {
2291 unsigned Min = UINT_MAX;
2292 unsigned SameOpNumber = 0;
2303 for (
int I = getNumLanes();
I > 0; --
I) {
2304 unsigned Lane =
I - 1;
2305 OperandsOrderData NumFreeOpsHash =
2306 getMaxNumOperandsThatCanBeReordered(Lane);
2309 if (NumFreeOpsHash.NumOfAPOs < Min) {
2310 Min = NumFreeOpsHash.NumOfAPOs;
2311 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2313 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2314 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
2315 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2318 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2319 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2320 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
2321 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2322 auto [It, Inserted] =
2323 HashMap.
try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2329 unsigned BestLane = 0;
2330 unsigned CntMin = UINT_MAX;
2332 if (
Data.second.first < CntMin) {
2333 CntMin =
Data.second.first;
2334 BestLane =
Data.second.second;
2341 struct OperandsOrderData {
2344 unsigned NumOfAPOs = UINT_MAX;
2347 unsigned NumOpsWithSameOpcodeParent = 0;
2361 OperandsOrderData getMaxNumOperandsThatCanBeReordered(
unsigned Lane)
const {
2362 unsigned CntTrue = 0;
2363 unsigned NumOperands = getNumOperands();
2373 bool AllUndefs =
true;
2374 unsigned NumOpsWithSameOpcodeParent = 0;
2378 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2379 const OperandData &OpData = getData(OpIdx, Lane);
2384 if (
auto *
I = dyn_cast<Instruction>(OpData.V)) {
2386 I->getParent() != Parent) {
2387 if (NumOpsWithSameOpcodeParent == 0) {
2388 NumOpsWithSameOpcodeParent = 1;
2390 Parent =
I->getParent();
2392 --NumOpsWithSameOpcodeParent;
2395 ++NumOpsWithSameOpcodeParent;
2399 Hash,
hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2400 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2404 OperandsOrderData
Data;
2405 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2406 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2414 assert((empty() || VL.
size() == getNumLanes()) &&
2415 "Expected same number of lanes");
2418 constexpr unsigned IntrinsicNumOperands = 2;
2420 ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
2421 OpsVec.
resize(NumOperands);
2422 unsigned NumLanes = VL.
size();
2423 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2424 OpsVec[OpIdx].
resize(NumLanes);
2425 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2426 assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
2427 "Expected instruction or poison value");
2438 if (isa<PoisonValue>(VL[Lane])) {
2439 OpsVec[OpIdx][Lane] = {
2444 bool IsInverseOperation = !
isCommutative(cast<Instruction>(VL[Lane]));
2445 bool APO = (OpIdx == 0) ?
false : IsInverseOperation;
2446 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2453 unsigned getNumOperands()
const {
return ArgSize; }
2456 unsigned getNumLanes()
const {
return OpsVec[0].
size(); }
2459 Value *getValue(
unsigned OpIdx,
unsigned Lane)
const {
2460 return getData(OpIdx, Lane).V;
2464 bool empty()
const {
return OpsVec.
empty(); }
2467 void clear() { OpsVec.
clear(); }
2472 bool shouldBroadcast(
Value *
Op,
unsigned OpIdx,
unsigned Lane) {
2473 assert(
Op == getValue(OpIdx, Lane) &&
2474 "Op is expected to be getValue(OpIdx, Lane).");
2476 if (isa<LoadInst>(
Op) && getNumLanes() == 2 && getNumOperands() == 2)
2478 bool OpAPO = getData(OpIdx, Lane).APO;
2479 bool IsInvariant = L && L->isLoopInvariant(
Op);
2481 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2485 bool FoundCandidate =
false;
2486 for (
unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2487 OperandData &
Data = getData(OpI, Ln);
2488 if (
Data.APO != OpAPO ||
Data.IsUsed)
2490 Value *OpILane = getValue(OpI, Lane);
2491 bool IsConstantOp = isa<Constant>(OpILane);
2500 ((Lns > 2 && isa<Constant>(
Data.V)) ||
2506 isa<Constant>(
Data.V)))) ||
2513 (IsInvariant && !isa<Constant>(
Data.V) &&
2515 L->isLoopInvariant(
Data.V))) {
2516 FoundCandidate =
true;
2523 if (!FoundCandidate)
2526 return getNumLanes() == 2 || Cnt > 1;
2531 bool canBeVectorized(
Instruction *
Op,
unsigned OpIdx,
unsigned Lane)
const {
2532 assert(
Op == getValue(OpIdx, Lane) &&
2533 "Op is expected to be getValue(OpIdx, Lane).");
2534 bool OpAPO = getData(OpIdx, Lane).APO;
2535 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2538 if (
any_of(seq<unsigned>(getNumOperands()), [&](
unsigned OpI) {
2539 const OperandData &
Data = getData(OpI, Ln);
2540 if (
Data.APO != OpAPO ||
Data.IsUsed)
2542 Value *OpILn = getValue(OpI, Ln);
2543 return (L && L->isLoopInvariant(OpILn)) ||
2555 : TLI(*R.TLI),
DL(*R.
DL), SE(*R.SE), R(R),
2556 L(R.LI->getLoopFor((VL0->
getParent()))) {
2558 appendOperandsOfVL(RootVL, VL0);
2565 assert(OpsVec[OpIdx].
size() == getNumLanes() &&
2566 "Expected same num of lanes across all operands");
2567 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2568 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2576 unsigned NumOperands = getNumOperands();
2577 unsigned NumLanes = getNumLanes();
2597 unsigned FirstLane = getBestLaneToStartReordering();
2600 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2601 Value *OpLane0 = getValue(OpIdx, FirstLane);
2604 if (
auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2606 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2607 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2608 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2609 else if (isa<LoadInst>(OpILane0))
2610 ReorderingModes[OpIdx] = ReorderingMode::Load;
2612 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2613 }
else if (isa<Constant>(OpLane0)) {
2614 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2615 }
else if (isa<Argument>(OpLane0)) {
2617 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2627 auto &&SkipReordering = [
this]() {
2630 for (
const OperandData &
Data : Op0)
2634 if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
2653 if (SkipReordering())
2656 bool StrategyFailed =
false;
2664 for (
unsigned I = 0;
I < NumOperands; ++
I)
2665 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
2668 UsedLanes.
set(FirstLane);
2669 for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2672 int Lane = FirstLane +
Direction * Distance;
2673 if (Lane < 0 || Lane >= (
int)NumLanes)
2675 UsedLanes.
set(Lane);
2677 assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
2680 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2682 std::optional<unsigned> BestIdx =
2683 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
2684 MainAltOps[OpIdx], UsedLanes);
2691 swap(OpIdx, *BestIdx, Lane);
2694 StrategyFailed =
true;
2697 if (MainAltOps[OpIdx].
size() != 2) {
2698 OperandData &AltOp = getData(OpIdx, Lane);
2699 InstructionsState OpS =
2701 if (OpS.getOpcode() && OpS.isAltShuffle())
2708 if (!StrategyFailed)
2713#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2716 case ReorderingMode::Load:
2718 case ReorderingMode::Opcode:
2720 case ReorderingMode::Constant:
2722 case ReorderingMode::Splat:
2724 case ReorderingMode::Failed:
2745 const unsigned Indent = 2;
2748 OS <<
"Operand " << Cnt++ <<
"\n";
2749 for (
const OperandData &OpData : OpDataVec) {
2751 if (
Value *V = OpData.V)
2755 OS <<
", APO:" << OpData.APO <<
"}\n";
2777 int BestScore = Limit;
2778 std::optional<int> Index;
2779 for (
int I : seq<int>(0, Candidates.size())) {
2781 Candidates[
I].second,
2784 if (Score > BestScore) {
2799 DeletedInstructions.insert(
I);
2804 template <
typename T>
2807 for (
T *V : DeadVals) {
2808 auto *
I = cast<Instruction>(V);
2809 DeletedInstructions.insert(
I);
2812 for (
T *V : DeadVals) {
2813 if (!V || !Processed.
insert(V).second)
2815 auto *
I = cast<Instruction>(V);
2818 if (
const TreeEntry *Entry = getTreeEntry(
I)) {
2819 Entries.push_back(Entry);
2820 auto It = MultiNodeScalars.find(
I);
2821 if (It != MultiNodeScalars.end())
2822 Entries.append(It->second.begin(), It->second.end());
2824 for (
Use &U :
I->operands()) {
2825 if (
auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2826 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2828 (Entries.empty() ||
none_of(Entries, [&](
const TreeEntry *Entry) {
2829 return Entry->VectorizedValue == OpI;
2833 I->dropAllReferences();
2835 for (
T *V : DeadVals) {
2836 auto *
I = cast<Instruction>(V);
2837 if (!
I->getParent())
2842 cast<Instruction>(U.getUser()));
2844 "trying to erase instruction with users.");
2845 I->removeFromParent();
2849 while (!DeadInsts.
empty()) {
2852 if (!VI || !VI->getParent())
2855 "Live instruction found in dead worklist!");
2856 assert(VI->use_empty() &&
"Instructions with uses are not dead.");
2863 for (
Use &OpU : VI->operands()) {
2864 Value *OpV = OpU.get();
2875 if (
auto *OpI = dyn_cast<Instruction>(OpV))
2876 if (!DeletedInstructions.contains(OpI) &&
2881 VI->removeFromParent();
2882 DeletedInstructions.insert(VI);
2890 return AnalyzedReductionsRoots.count(
I);
2895 AnalyzedReductionsRoots.insert(
I);
2909 AnalyzedReductionsRoots.clear();
2910 AnalyzedReductionVals.
clear();
2911 AnalyzedMinBWVals.
clear();
2923 return NonScheduledFirst.
contains(V);
2936 bool collectValuesToDemote(
2937 const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
2940 bool &IsProfitableToDemote,
bool IsTruncRoot)
const;
2950 canReorderOperands(TreeEntry *UserTE,
2957 void reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const;
2961 TreeEntry *getVectorizedOperand(TreeEntry *UserTE,
unsigned OpIdx) {
2963 TreeEntry *TE =
nullptr;
2965 TE = getTreeEntry(V);
2966 if (TE &&
is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2968 auto It = MultiNodeScalars.find(V);
2969 if (It != MultiNodeScalars.end()) {
2970 for (TreeEntry *E : It->second) {
2971 if (
is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2979 if (It != VL.
end()) {
2980 assert(
TE->isSame(VL) &&
"Expected same scalars.");
2988 const TreeEntry *getVectorizedOperand(
const TreeEntry *UserTE,
2989 unsigned OpIdx)
const {
2990 return const_cast<BoUpSLP *
>(
this)->getVectorizedOperand(
2991 const_cast<TreeEntry *
>(UserTE), OpIdx);
2995 bool areAllUsersVectorized(
3004 const TreeEntry *getOperandEntry(
const TreeEntry *E,
unsigned Idx)
const;
3009 Instruction *getRootEntryInstruction(
const TreeEntry &Entry)
const;
3013 getCastContextHint(
const TreeEntry &TE)
const;
3022 const EdgeInfo &EI,
unsigned InterleaveFactor = 0);
3033 bool ResizeAllowed =
false)
const;
3042 TreeEntry *getMatchedVectorizedOperand(
const TreeEntry *E,
unsigned NodeIdx);
3043 const TreeEntry *getMatchedVectorizedOperand(
const TreeEntry *E,
3044 unsigned NodeIdx)
const {
3045 return const_cast<BoUpSLP *
>(
this)->getMatchedVectorizedOperand(E, NodeIdx);
3052 Value *vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
bool PostponedPHIs);
3057 template <
typename BVTy,
typename ResTy,
typename...
Args>
3058 ResTy processBuildVector(
const TreeEntry *E,
Type *ScalarTy, Args &...Params);
3063 Value *createBuildVector(
const TreeEntry *E,
Type *ScalarTy,
3064 bool PostponedPHIs);
3070 Instruction &getLastInstructionInBundle(
const TreeEntry *E);
3077 std::optional<TargetTransformInfo::ShuffleKind>
3089 unsigned NumParts)
const;
3101 std::optional<TargetTransformInfo::ShuffleKind>
3102 isGatherShuffledSingleRegisterEntry(
3119 isGatherShuffledEntry(
3122 unsigned NumParts,
bool ForOrder =
false);
3128 Type *ScalarTy)
const;
3132 void setInsertPointAfterBundle(
const TreeEntry *E);
3142 bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
3147 void tryToVectorizeGatheredLoads(
3156 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
3172 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
3176 void reorderGatherNode(TreeEntry &TE);
3180 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3197 [Scalars](
Value *V,
int Idx) {
3198 return (isa<UndefValue>(V) &&
3199 Idx == PoisonMaskElem) ||
3200 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3203 if (!ReorderIndices.empty()) {
3210 return IsSame(Scalars, Mask);
3211 if (VL.
size() == ReuseShuffleIndices.size()) {
3213 return IsSame(Scalars, Mask);
3217 return IsSame(Scalars, ReuseShuffleIndices);
3220 bool isOperandGatherNode(
const EdgeInfo &UserEI)
const {
3221 return isGather() && !UserTreeIndices.empty() &&
3222 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3223 UserTreeIndices.front().UserTE == UserEI.UserTE;
3227 bool hasEqualOperands(
const TreeEntry &TE)
const {
3228 if (
TE.getNumOperands() != getNumOperands())
3231 for (
unsigned I = 0, E = getNumOperands();
I < E; ++
I) {
3232 unsigned PrevCount =
Used.count();
3233 for (
unsigned K = 0;
K < E; ++
K) {
3236 if (getOperand(K) ==
TE.getOperand(
I)) {
3242 if (PrevCount ==
Used.count())
3251 unsigned getVectorFactor()
const {
3252 if (!ReuseShuffleIndices.empty())
3253 return ReuseShuffleIndices.size();
3254 return Scalars.
size();
3258 bool isGather()
const {
return State == NeedToGather; }
3285 enum CombinedOpcode {
3287 MinMax = Instruction::OtherOpsEnd + 1,
3289 CombinedOpcode CombinedOp = NotCombinedOp;
3303 VecTreeTy &Container;
3327 unsigned InterleaveFactor = 0;
3331 unsigned getInterleaveFactor()
const {
return InterleaveFactor; }
3333 void setInterleave(
unsigned Factor) { InterleaveFactor = Factor; }
3339 assert(Operands[OpIdx].empty() &&
"Already resized?");
3341 "Number of operands is greater than the number of scalars.");
3347 void setOperand(
const BoUpSLP &R,
bool RequireReorder =
false) {
3348 VLOperands Ops(Scalars, MainOp, R);
3352 setOperand(
I, Ops.getVL(
I));
3374 unsigned getNumOperands()
const {
return Operands.size(); }
3377 Value *getSingleOperand(
unsigned OpIdx)
const {
3379 assert(!Operands[OpIdx].empty() &&
"No operand available");
3384 bool isAltShuffle()
const {
return MainOp != AltOp; }
3387 unsigned CheckedOpcode =
I->getOpcode();
3388 return (getOpcode() == CheckedOpcode ||
3389 getAltOpcode() == CheckedOpcode);
3396 auto *
I = dyn_cast<Instruction>(
Op);
3397 if (
I && isOpcodeOrAlt(
I))
3402 void setOperations(
const InstructionsState &S) {
3403 MainOp = S.getMainOp();
3404 AltOp = S.getAltOp();
3416 unsigned getOpcode()
const {
3417 return MainOp ? MainOp->
getOpcode() : 0;
3420 unsigned getAltOpcode()
const {
3426 int findLaneForValue(
Value *V)
const {
3427 unsigned FoundLane = getVectorFactor();
3428 for (
auto *It =
find(Scalars, V), *
End = Scalars.end(); It !=
End;
3429 std::advance(It, 1)) {
3432 FoundLane = std::distance(Scalars.begin(), It);
3433 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
3434 if (!ReorderIndices.
empty())
3435 FoundLane = ReorderIndices[FoundLane];
3436 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
3437 if (ReuseShuffleIndices.
empty())
3439 if (
auto *RIt =
find(ReuseShuffleIndices, FoundLane);
3440 RIt != ReuseShuffleIndices.
end()) {
3441 FoundLane = std::distance(ReuseShuffleIndices.
begin(), RIt);
3445 assert(FoundLane < getVectorFactor() &&
"Unable to find given value.");
3458 bool isNonPowOf2Vec()
const {
3460 return IsNonPowerOf2;
3469 assert((!IsNonPowerOf2 || ReuseShuffleIndices.
empty()) &&
3470 "Reshuffling not supported with non-power-of-2 vectors yet.");
3471 return IsNonPowerOf2;
3474 Value *getOrdered(
unsigned Idx)
const {
3475 assert(
isGather() &&
"Must be used only for buildvectors/gathers.");
3476 if (ReorderIndices.
empty())
3477 return Scalars[
Idx];
3487 for (
unsigned OpI = 0, OpE =
Operands.size(); OpI != OpE; ++OpI) {
3488 dbgs() <<
"Operand " << OpI <<
":\n";
3489 for (
const Value *V : Operands[OpI])
3492 dbgs() <<
"Scalars: \n";
3493 for (
Value *V : Scalars)
3495 dbgs() <<
"State: ";
3498 if (InterleaveFactor > 0) {
3499 dbgs() <<
"Vectorize with interleave factor " << InterleaveFactor
3502 dbgs() <<
"Vectorize\n";
3505 case ScatterVectorize:
3506 dbgs() <<
"ScatterVectorize\n";
3508 case StridedVectorize:
3509 dbgs() <<
"StridedVectorize\n";
3512 dbgs() <<
"NeedToGather\n";
3514 case CombinedVectorize:
3515 dbgs() <<
"CombinedVectorize\n";
3518 dbgs() <<
"MainOp: ";
3520 dbgs() << *MainOp <<
"\n";
3523 dbgs() <<
"AltOp: ";
3525 dbgs() << *AltOp <<
"\n";
3528 dbgs() <<
"VectorizedValue: ";
3529 if (VectorizedValue)
3530 dbgs() << *VectorizedValue <<
"\n";
3533 dbgs() <<
"ReuseShuffleIndices: ";
3534 if (ReuseShuffleIndices.
empty())
3537 for (
int ReuseIdx : ReuseShuffleIndices)
3538 dbgs() << ReuseIdx <<
", ";
3540 dbgs() <<
"ReorderIndices: ";
3541 for (
unsigned ReorderIdx : ReorderIndices)
3542 dbgs() << ReorderIdx <<
", ";
3544 dbgs() <<
"UserTreeIndices: ";
3545 for (
const auto &EInfo : UserTreeIndices)
3546 dbgs() << EInfo <<
", ";
3548 if (!CombinedEntriesWithIndices.
empty()) {
3549 dbgs() <<
"Combined entries: ";
3551 dbgs() <<
"Entry index " <<
P.first <<
" with offset " <<
P.second;
3560 void dumpTreeCosts(
const TreeEntry *E,
InstructionCost ReuseShuffleCost,
3563 dbgs() <<
"SLP: " << Banner <<
":\n";
3565 dbgs() <<
"SLP: Costs:\n";
3566 dbgs() <<
"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<
"\n";
3567 dbgs() <<
"SLP: VectorCost = " << VecCost <<
"\n";
3568 dbgs() <<
"SLP: ScalarCost = " << ScalarCost <<
"\n";
3569 dbgs() <<
"SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3570 << ReuseShuffleCost + VecCost - ScalarCost <<
"\n";
3576 std::optional<ScheduleData *> Bundle,
3577 const InstructionsState &S,
3578 const EdgeInfo &UserTreeIdx,
3581 unsigned InterleaveFactor = 0) {
3582 TreeEntry::EntryState EntryState =
3583 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3584 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3585 ReuseShuffleIndices, ReorderIndices);
3586 if (E && InterleaveFactor > 0)
3587 E->setInterleave(InterleaveFactor);
3592 TreeEntry::EntryState EntryState,
3593 std::optional<ScheduleData *> Bundle,
3594 const InstructionsState &S,
3595 const EdgeInfo &UserTreeIdx,
3598 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3599 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3600 "Need to vectorize gather entry?");
3602 if (GatheredLoadsEntriesFirst.has_value() &&
3603 EntryState == TreeEntry::NeedToGather &&
3604 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3605 !UserTreeIdx.UserTE)
3607 VectorizableTree.
push_back(std::make_unique<TreeEntry>(VectorizableTree));
3608 TreeEntry *
Last = VectorizableTree.
back().get();
3609 Last->Idx = VectorizableTree.
size() - 1;
3610 Last->State = EntryState;
3615 ReuseShuffleIndices.empty()) &&
3616 "Reshuffling scalars not yet supported for nodes with padding");
3617 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3618 ReuseShuffleIndices.end());
3619 if (ReorderIndices.
empty()) {
3621 Last->setOperations(S);
3624 Last->Scalars.assign(VL.
size(),
nullptr);
3627 if (Idx >= VL.size())
3628 return UndefValue::get(VL.front()->getType());
3632 Last->setOperations(S);
3633 Last->ReorderIndices.append(ReorderIndices.
begin(), ReorderIndices.
end());
3635 if (!
Last->isGather()) {
3636 for (
Value *V : VL) {
3637 const TreeEntry *
TE = getTreeEntry(V);
3639 "Scalar already in tree!");
3642 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(
Last);
3645 ScalarToTreeEntry[
V] =
Last;
3648 ScheduleData *BundleMember = *Bundle;
3649 assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
3652 "Bundle and VL out of sync");
3654 for (
Value *V : VL) {
3659 BundleMember->TE =
Last;
3660 BundleMember = BundleMember->NextInBundle;
3663 assert(!BundleMember &&
"Bundle and VL out of sync");
3666 bool AllConstsOrCasts =
true;
3669 auto *
I = dyn_cast<CastInst>(V);
3670 AllConstsOrCasts &=
I &&
I->getType()->isIntegerTy();
3671 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3672 !UserTreeIdx.UserTE->isGather())
3675 if (AllConstsOrCasts)
3677 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3678 MustGather.
insert(VL.begin(), VL.end());
3681 if (UserTreeIdx.UserTE)
3682 Last->UserTreeIndices.push_back(UserTreeIdx);
3688 TreeEntry::VecTreeTy VectorizableTree;
3693 for (
unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3694 VectorizableTree[
Id]->dump();
3700 TreeEntry *getTreeEntry(
Value *V) {
return ScalarToTreeEntry.lookup(V); }
3702 const TreeEntry *getTreeEntry(
Value *V)
const {
3703 return ScalarToTreeEntry.lookup(V);
3712 bool areAltOperandsProfitable(
const InstructionsState &S,
3717 TreeEntry::EntryState
3719 bool IsScatterVectorizeUserTE,
3752 using ValueToGatherNodesMap =
3754 ValueToGatherNodesMap ValueToGatherNodes;
3762 bool IsGraphTransformMode =
false;
3765 std::optional<unsigned> GatheredLoadsEntriesFirst;
3768 struct ExternalUser {
3792 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
3793 auto It = AliasCache.
find(Key);
3794 if (It != AliasCache.
end())
3799 AliasCache.
try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3803 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3835 UserList ExternalUses;
3858 struct ScheduleData {
3861 enum { InvalidDeps = -1 };
3863 ScheduleData() =
default;
3866 FirstInBundle =
this;
3867 NextInBundle =
nullptr;
3868 NextLoadStore =
nullptr;
3869 IsScheduled =
false;
3870 SchedulingRegionID = BlockSchedulingRegionID;
3871 clearDependencies();
3878 if (hasValidDependencies()) {
3879 assert(UnscheduledDeps <= Dependencies &&
"invariant");
3881 assert(UnscheduledDeps == Dependencies &&
"invariant");
3885 assert(isSchedulingEntity() &&
3886 "unexpected scheduled state");
3887 for (
const ScheduleData *BundleMember =
this; BundleMember;
3888 BundleMember = BundleMember->NextInBundle) {
3889 assert(BundleMember->hasValidDependencies() &&
3890 BundleMember->UnscheduledDeps == 0 &&
3891 "unexpected scheduled state");
3892 assert((BundleMember ==
this || !BundleMember->IsScheduled) &&
3893 "only bundle is marked scheduled");
3897 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3898 "all bundle members must be in same basic block");
3904 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
3908 bool isSchedulingEntity()
const {
return FirstInBundle ==
this; }
3912 bool isPartOfBundle()
const {
3913 return NextInBundle !=
nullptr || FirstInBundle !=
this ||
TE;
3918 bool isReady()
const {
3919 assert(isSchedulingEntity() &&
3920 "can't consider non-scheduling entity for ready list");
3921 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3927 int incrementUnscheduledDeps(
int Incr) {
3928 assert(hasValidDependencies() &&
3929 "increment of unscheduled deps would be meaningless");
3930 UnscheduledDeps += Incr;
3931 return FirstInBundle->unscheduledDepsInBundle();
3936 void resetUnscheduledDeps() {
3937 UnscheduledDeps = Dependencies;
3941 void clearDependencies() {
3942 Dependencies = InvalidDeps;
3943 resetUnscheduledDeps();
3944 MemoryDependencies.clear();
3945 ControlDependencies.clear();
3948 int unscheduledDepsInBundle()
const {
3949 assert(isSchedulingEntity() &&
"only meaningful on the bundle");
3951 for (
const ScheduleData *BundleMember =
this; BundleMember;
3952 BundleMember = BundleMember->NextInBundle) {
3953 if (BundleMember->UnscheduledDeps == InvalidDeps)
3955 Sum += BundleMember->UnscheduledDeps;
3961 if (!isSchedulingEntity()) {
3962 os <<
"/ " << *Inst;
3963 }
else if (NextInBundle) {
3965 ScheduleData *SD = NextInBundle;
3967 os <<
';' << *SD->Inst;
3968 SD = SD->NextInBundle;
3979 TreeEntry *
TE =
nullptr;
3983 ScheduleData *FirstInBundle =
nullptr;
3987 ScheduleData *NextInBundle =
nullptr;
3991 ScheduleData *NextLoadStore =
nullptr;
4005 int SchedulingRegionID = 0;
4008 int SchedulingPriority = 0;
4014 int Dependencies = InvalidDeps;
4020 int UnscheduledDeps = InvalidDeps;
4024 bool IsScheduled =
false;
4029 const BoUpSLP::ScheduleData &SD) {
4054 struct BlockScheduling {
4056 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
4060 ScheduleStart =
nullptr;
4061 ScheduleEnd =
nullptr;
4062 FirstLoadStoreInRegion =
nullptr;
4063 LastLoadStoreInRegion =
nullptr;
4064 RegionHasStackSave =
false;
4068 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4071 ScheduleRegionSize = 0;
4075 ++SchedulingRegionID;
4079 if (BB !=
I->getParent())
4082 ScheduleData *SD = ScheduleDataMap.lookup(
I);
4083 if (SD && isInSchedulingRegion(SD))
4088 ScheduleData *getScheduleData(
Value *V) {
4089 if (
auto *
I = dyn_cast<Instruction>(V))
4090 return getScheduleData(
I);
4094 bool isInSchedulingRegion(ScheduleData *SD)
const {
4095 return SD->SchedulingRegionID == SchedulingRegionID;
4100 template <
typename ReadyListType>
4101 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
4102 SD->IsScheduled =
true;
4105 for (ScheduleData *BundleMember = SD; BundleMember;
4106 BundleMember = BundleMember->NextInBundle) {
4111 auto &&DecrUnsched = [
this, &ReadyList](
Instruction *
I) {
4112 ScheduleData *OpDef = getScheduleData(
I);
4113 if (OpDef && OpDef->hasValidDependencies() &&
4114 OpDef->incrementUnscheduledDeps(-1) == 0) {
4118 ScheduleData *DepBundle = OpDef->FirstInBundle;
4119 assert(!DepBundle->IsScheduled &&
4120 "already scheduled bundle gets ready");
4121 ReadyList.insert(DepBundle);
4123 <<
"SLP: gets ready (def): " << *DepBundle <<
"\n");
4130 if (TreeEntry *TE = BundleMember->TE) {
4132 int Lane = std::distance(
TE->Scalars.begin(),
4133 find(
TE->Scalars, BundleMember->Inst));
4134 assert(Lane >= 0 &&
"Lane not set");
4142 auto *
In = BundleMember->Inst;
4145 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
4146 In->getNumOperands() ==
TE->getNumOperands()) &&
4147 "Missed TreeEntry operands?");
4150 for (
unsigned OpIdx = 0, NumOperands =
TE->getNumOperands();
4151 OpIdx != NumOperands; ++OpIdx)
4152 if (
auto *
I = dyn_cast<Instruction>(
TE->getOperand(OpIdx)[Lane]))
4157 for (
Use &U : BundleMember->Inst->operands())
4158 if (
auto *
I = dyn_cast<Instruction>(
U.get()))
4162 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
4163 if (MemoryDepSD->hasValidDependencies() &&
4164 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
4167 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
4168 assert(!DepBundle->IsScheduled &&
4169 "already scheduled bundle gets ready");
4170 ReadyList.insert(DepBundle);
4172 <<
"SLP: gets ready (mem): " << *DepBundle <<
"\n");
4176 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
4177 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
4180 ScheduleData *DepBundle = DepSD->FirstInBundle;
4181 assert(!DepBundle->IsScheduled &&
4182 "already scheduled bundle gets ready");
4183 ReadyList.insert(DepBundle);
4185 <<
"SLP: gets ready (ctl): " << *DepBundle <<
"\n");
4196 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
4197 ScheduleStart->comesBefore(ScheduleEnd) &&
4198 "Not a valid scheduling region?");
4200 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
4201 auto *SD = getScheduleData(
I);
4204 assert(isInSchedulingRegion(SD) &&
4205 "primary schedule data not in window?");
4206 assert(isInSchedulingRegion(SD->FirstInBundle) &&
4207 "entire bundle in window!");
4211 for (
auto *SD : ReadyInsts) {
4212 assert(SD->isSchedulingEntity() && SD->isReady() &&
4213 "item in ready list not ready?");
4219 template <
typename ReadyListType>
4220 void initialFillReadyList(ReadyListType &ReadyList) {
4221 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
4222 ScheduleData *SD = getScheduleData(
I);
4223 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
4225 ReadyList.insert(SD);
4227 <<
"SLP: initially in ready list: " << *SD <<
"\n");
4241 std::optional<ScheduleData *>
4243 const InstructionsState &S);
4249 ScheduleData *allocateScheduleDataChunks();
4253 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
4258 ScheduleData *PrevLoadStore,
4259 ScheduleData *NextLoadStore);
4263 void calculateDependencies(ScheduleData *SD,
bool InsertInReadyList,
4267 void resetSchedule();
4297 ScheduleData *FirstLoadStoreInRegion =
nullptr;
4301 ScheduleData *LastLoadStoreInRegion =
nullptr;
4306 bool RegionHasStackSave =
false;
4309 int ScheduleRegionSize = 0;
4318 int SchedulingRegionID = 1;
4326 void scheduleBlock(BlockScheduling *BS);
4333 struct OrdersTypeDenseMapInfo {
4346 static unsigned getHashValue(
const OrdersType &V) {
4367 unsigned MaxVecRegSize;
4368 unsigned MinVecRegSize;
4383 unsigned ReductionBitWidth = 0;
4386 unsigned BaseGraphSize = 1;
4390 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4409 struct ChildIteratorType
4411 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4422 return R.VectorizableTree[0].get();
4426 return {
N->UserTreeIndices.begin(),
N->Container};
4430 return {
N->UserTreeIndices.end(),
N->Container};
4435 class nodes_iterator {
4446 bool operator!=(
const nodes_iterator &N2)
const {
return N2.It != It; }
4450 return nodes_iterator(R->VectorizableTree.begin());
4454 return nodes_iterator(R->VectorizableTree.end());
4457 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
4468 OS << Entry->Idx <<
".\n";
4471 for (
auto *V : Entry->Scalars) {
4473 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
4474 return EU.Scalar == V;
4484 if (Entry->isGather())
4486 if (Entry->State == TreeEntry::ScatterVectorize ||
4487 Entry->State == TreeEntry::StridedVectorize)
4488 return "color=blue";
4497 for (
auto *
I : DeletedInstructions) {
4498 if (!
I->getParent()) {
4501 if (isa<PHINode>(
I))
4503 I->insertBefore(
F->getEntryBlock(),
4504 F->getEntryBlock().getFirstNonPHIIt());
4506 I->insertBefore(
F->getEntryBlock().getTerminator());
4509 for (
Use &U :
I->operands()) {
4510 auto *
Op = dyn_cast<Instruction>(U.get());
4511 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
4515 I->dropAllReferences();
4517 for (
auto *
I : DeletedInstructions) {
4519 "trying to erase instruction with users.");
4520 I->eraseFromParent();
4526#ifdef EXPENSIVE_CHECKS
4537 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
4538 "Expected non-empty mask.");
4541 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
4543 Reuses[Mask[
I]] = Prev[
I];
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
    Order.clear();
    return;
  }
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
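// Example: with Mask = {1, 0, 3, 2} an empty Order is treated as the
// identity, so the resulting order swaps each adjacent pair; entries equal to
// the mask size act as "unassigned" slots that fixupOrderingIndices fills in
// afterwards.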
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  // ...
  Type *ScalarTy = GatheredScalars.front()->getType();
  int NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  // ...
  if (NumParts == 0 || NumParts >= NumScalars ||
      VecTy->getNumElements() % NumParts != 0 ||
      !hasFullVectorsOrPowerOf2(*TTI, ScalarTy,
                                VecTy->getNumElements() / NumParts))
    NumParts = 1;
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                            /*ForOrder=*/true);
  // No shuffled operands - ignore.
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
      Entries.front().front()->isSame(TE.Scalars)) {
    // Perfect match in the graph, will reuse the previously vectorized
    // node. Cost is 0.
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    int SingleElt = PoisonMaskElem;
    return all_of(Mask, [&](int I) {
      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
        SingleElt = I;
      return I == PoisonMaskElem || I == SingleElt;
    });
  };
  // Exclusive broadcast mask - ignore.
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  SmallBitVector ShuffledSubMasks(NumParts);
  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                  ArrayRef<int> Mask, int PartSz, int NumParts,
                                  function_ref<unsigned(unsigned)> GetVF) {
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
        continue;
      const int VF = GetVF(I);
      if (VF == 0)
        continue;
      unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
      MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
      // Shuffle of at least 2 vectors - ignore.
      if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      // Try to include as many elements from the mask as possible.
      int FirstMin = INT_MAX;
      int SecondVecFound = false;
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem) {
          Value *V = GatheredScalars[I * PartSz + K];
          if (isConstant(V) && !isa<PoisonValue>(V)) {
            SecondVecFound = true;
            break;
          }
          continue;
        }
        if (Idx < VF) {
          if (FirstMin > Idx)
            FirstMin = Idx;
        } else {
          SecondVecFound = true;
          break;
        }
      }
      FirstMin = (FirstMin / PartSz) * PartSz;
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem)
          continue;
        Idx -= FirstMin;
        if (Idx >= PartSz) {
          SecondVecFound = true;
          break;
        }
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      }
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
    }
  };
  int PartSz = getPartNumElems(NumScalars, NumParts);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          unsigned VF = 0;
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            if (ExtractMask[K] == PoisonMaskElem)
              continue;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (K == PoisonMaskElem)
              continue;
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  // Check special corner case - single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
        return 0U;
      return std::max(Entries[I].front()->getVectorFactor(),
                      Entries[I].back()->getVectorFactor());
    });
  int NumUndefs =
      count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
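// In short: this looks for an order in which the gathered scalars could be
// produced as shuffles of already-vectorized tree entries (or of extracted
// vector elements). If too many lanes remain unassigned, reordering the
// gather is not worth it and std::nullopt is returned.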
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
               .getOpcode()));
}
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}
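// Typical use: computeCommonAlignment<LoadInst>(VL) (or <StoreInst>) yields
// the weakest alignment among the scalar accesses, which is the strongest
// alignment that can legally be claimed for the combined vector access.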
4793 "Order is empty. Please check it before using isReverseOrder.");
4794 unsigned Sz = Order.
size();
4796 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                  const DataLayout &DL, ScalarEvolution &SE,
                  SmallVectorImpl<unsigned> &SortedIndices,
                  Instruction *Inst = nullptr) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find lower/upper pointers among the pointer operands.
  for (const Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(const_cast<Value *>(Ptr));
    if (!PtrSCEV)
      return std::nullopt;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
    if (Diff->isNonConstantNegative()) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
    if (Diff1->isNonConstantNegative())
      PtrSCEVHighest = PtrSCEV;
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest.
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return nullptr;
  };
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz =
        SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return std::nullopt;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  // Iterate through all pointers and check that every distance from the
  // lowest pointer is a unique multiple of Stride.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    unsigned Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
      if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
                                                  SE.getMulExpr(Stride, SC)))
               ->isZero())
        return std::nullopt;
      Dist = SC->getAPInt().getZExtValue();
    }
    if (Dist >= SCEVs.size())
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return std::nullopt;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices array only if it is non-consecutive.
    SortedIndices.resize(PointerOps.size(), 0);
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  // ... (materialize the run-time stride value at Inst, if requested)
}
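// The idea: if all pointer differences are SCEV multiples of one unknown
// Stride (p, p+S, p+2S, ...), the loads can become a single strided load
// whose stride is only known at run time; the Offsets set doubles as a
// sorter, producing SortedIndices when the pointers are not already in
// stride order.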
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy);

// ...
  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
  int NumSubElts;
  if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
                             Mask, NumSrcElts, NumSubElts, Index)) {
    if (Index + NumSubElts > NumSrcElts &&
        Index + NumSrcElts <= static_cast<int>(Mask.size())) {
      // ...
    }
  }
BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                           SmallVectorImpl<unsigned> &Order,
                           SmallVectorImpl<Value *> &PointerOps,
                           unsigned *BestVF, bool TryRecursiveCheck) const {
  // ...
  // Don't vectorize loads of a non-packed type: the vector load would read
  // padding bits that the scalar loads do not.
  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  // Make sure all loads in the bundle are simple - we can't vectorize
  // atomic or volatile loads.
  const unsigned Sz = VL.size();
  PointerOps.resize(Sz);
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }
  // ...
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  // ...
    Value *Ptr0;
    Value *PtrN;
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    }
    std::optional<int> Diff =
        getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
    // Check that the sorted loads are consecutive.
    if (static_cast<unsigned>(*Diff) == Sz - 1)
      return LoadsState::Vectorize;
    // ...
    bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
    // Try to generate a strided load node only if the stride is regular and
    // at least one pointer is used outside of the graph (so its address
    // computation will not simply disappear after vectorization).
    auto IsAnyPointerUsedOutGraph =
        IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
          return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
                   return !getTreeEntry(U) && !MustGather.contains(U);
                 });
        });
    const unsigned AbsoluteDiff = std::abs(*Diff);
    if (IsPossibleStrided &&
        (IsAnyPointerUsedOutGraph ||
         (/* ... */ AbsoluteDiff > Sz) ||
         *Diff == -(static_cast<int>(Sz) - 1))) {
      int Stride = *Diff / static_cast<int>(Sz - 1);
      if (*Diff == Stride * static_cast<int>(Sz - 1)) {
        // Check that each pointer sits at its own multiple of Stride.
        SmallSet<int, 8> Dists;
        for (Value *Ptr : PointerOps) {
          int Dist = 0;
          if (Ptr == PtrN)
            Dist = *Diff;
          else if (Ptr != Ptr0)
            Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
          if (((Dist / Stride) * Stride) != Dist ||
              !Dists.insert(Dist).second)
            break;
        }
        if (Dists.size() == Sz)
          return LoadsState::StridedVectorize;
      }
    }
  // ...
  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
                                                unsigned *BestVF,
                                                bool ProfitableGatherPointers) {
    // ...
    auto [ScalarGEPCost, VectorGEPCost] =
        getGEPCosts(TTI, PointerOps, PointerOps.front(),
                    Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
    // Estimate the cost of the masked gather GEPs: if the pointers are not a
    // splat, roughly the additional cost of inserting them into a vector.
    auto *PtrVecTy = getWidenedType(PointerOps.front()->getType(),
                                    VecTy->getNumElements());
    if (static_cast<unsigned>(count_if(
            PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
        /* ... */ false)
      VectorGEPCost += TTI.getScalarizationOverhead(
          PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false,
          CostKind);
    // ...
    InstructionCost MaskedGatherCost =
        TTI.getGatherScatterOpCost(Instruction::Load, VecTy,
                                   PointerOps.front(),
                                   /*VariableMask=*/false, CommonAlignment,
                                   CostKind) +
        (ProfitableGatherPointers ? 0 : VectorGEPCost);
    // If the masked gather cost is higher, it may pay off to split the bundle
    // into smaller consecutive slices and vectorize those instead.
    constexpr unsigned ListLimit = 4;
    if (!TryRecursiveCheck || VL.size() < ListLimit)
      return false;
    // ...
    unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
    // ...
    for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
        // ...
          DemandedElts.setBits(Cnt, Cnt + VF);
        // ...
      }
      if (!DemandedElts.isZero()) {
        // Gathers are already calculated - ignore.
        for (unsigned Idx : seq<unsigned>(VL.size()))
          if (DemandedElts[Idx])
            ; // ... (account for extracting this lane separately)
        continue;
      }
      // ...
        auto *LI0 = cast<LoadInst>(VL[I * VF]);
        auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
            TTI, PointerOps.slice(I * VF, VF), LI0->getPointerOperand(),
            Instruction::GetElementPtr, CostKind, ScalarTy, SubVecTy);
        if (static_cast<unsigned>(
                count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
                PointerOps.size() - 1 ||
            /* ... */ false) {
          // ...
        }
        // ... (per-slice cost of vector/strided/scatter loads, using
        //      LI0->getPointerAddressSpace() and LI0->getPointerOperand())
      for (int Idx : seq<int>(0, VL.size()))
        ; // ...
      if (MaskedGatherCost >= VecLdCost &&
          /* ... */ true)
        return true;
    }
    return false;
  };
  // ...
  bool ProfitableGatherPointers =
      L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
                       return L->isLoopInvariant(V);
                     })) <= Sz / 2;
  if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
        return !GEP || (GEP && GEP->getNumOperands() == 2 &&
                        isa<Constant, Instruction>(GEP->getOperand(1)));
      })) {
    // ...
    if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
                                                     ProfitableGatherPointers))
      return LoadsState::ScatterVectorize;
  }
  return LoadsState::Gather;
}
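// Decision ladder, cheapest form first: consecutive loads become a plain
// vector load (Vectorize), equally spaced ones a strided load
// (StridedVectorize), arbitrary but addressable ones a masked gather
// (ScatterVectorize), and anything else stays a Gather of scalars. The
// recursive CheckForShuffledLoads probe guards against picking a masked
// gather when splitting VL into halves would vectorize better.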
5297 "Expected list of pointer operands.");
5307 .first->second.emplace_back().emplace_back(VL.
front(), 0U, 0U);
5309 SortedIndices.
clear();
5311 auto Key = std::make_pair(BBs[Cnt + 1],
5315 std::optional<int> Diff = getPointersDiff(
5316 ElemTy, std::get<0>(Base.front()), ElemTy,
5322 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5328 if (Bases.
size() > VL.
size() / 2 - 1)
5332 Bases.
find(Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
5339 if (Bases.
size() == 1 && (Bases.
front().second.size() == 1 ||
5340 Bases.
front().second.size() == VL.
size()))
5345 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
5355 FirstPointers.
insert(P1);
5356 SecondPointers.
insert(P2);
5362 "Unable to find matching root.");
5365 for (
auto &
Base : Bases) {
5366 for (
auto &Vec :
Base.second) {
5367 if (Vec.size() > 1) {
5368 stable_sort(Vec, [](
const std::tuple<Value *, int, unsigned> &
X,
5369 const std::tuple<Value *, int, unsigned> &
Y) {
5370 return std::get<1>(
X) < std::get<1>(
Y);
5372 int InitialOffset = std::get<1>(Vec[0]);
5373 bool AnyConsecutive =
5375 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
5379 if (!AnyConsecutive)
5384 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5388 for (
auto &
T : Bases)
5389 for (
const auto &Vec :
T.second)
5390 for (
const auto &
P : Vec)
5394 "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();

  SmallVector<Value *> Ptrs;
  Ptrs.reserve(TE.Scalars.size());
  SmallVector<BasicBlock *> BBs;
  BBs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    Ptrs.push_back(L->getPointerOperand());
    BBs.push_back(L->getParent());
  }

  BoUpSLP::OrdersType Order;
  if (!LoadEntriesToVectorize.contains(TE.Idx) &&
      clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
    return std::move(Order);
  return std::nullopt;
}
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *VU, InsertElementInst *V,
    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Both inserts must build vectors of the same type.
  if (VU->getType() != V->getType())
    return false;
  // Multiply-used inserts are separate nodes.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  auto *IE1 = VU;
  auto *IE2 = V;
  std::optional<unsigned> Idx1 = getElementIndex(IE1);
  std::optional<unsigned> Idx2 = getElementIndex(IE2);
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  // Go through the vector operands of the insertelement chain trying to find
  // either VU as the original vector for IE2, or V as the original vector
  // for IE1.
  SmallBitVector ReusedIdx(
      cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
  bool IsReusedIdx = false;
  do {
    if (IE2 == VU && !IE1)
      return VU->hasOneUse();
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
      IsReusedIdx |= ReusedIdx.test(Idx1);
      ReusedIdx.set(Idx1);
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        IE1 = nullptr;
      else
        IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    }
    if (IE2 && IE2 != VU) {
      unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
      IsReusedIdx |= ReusedIdx.test(Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        IE2 = nullptr;
      else
        IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
    }
  } while (!IsReusedIdx && (IE1 || IE2));
  return false;
}
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
  // No need to reorder if we only need to shuffle reuses: the node still
  // needs a shuffle anyway.
  if (!TE.ReuseShuffleIndices.empty()) {
    // FIXME: support ReuseShuffleIndices for non-power-of-two vectors.
    assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    // ...
      return std::nullopt;
    // Check if the reuse shuffle indices can be improved by reordering.
    unsigned Sz = TE.Scalars.size();
    if (TE.isGather()) {
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE)) {
        SmallVector<int> Mask;
        fixupOrderingIndices(*CurrentOrder);
        inversePermutation(*CurrentOrder, Mask);
        ::addMask(Mask, TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
            if (Idx != PoisonMaskElem)
              Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        ::getNumberOfParts(*TTI, getWidenedType(TE.Scalars.front()->getType(),
                                                2 * TE.getVectorFactor())) == 1)
      return std::nullopt;
    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                     Sz)) {
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      ::addMask(ReorderMask, TE.ReuseShuffleIndices);
      unsigned VF = ReorderMask.size();
      OrdersType ResOrder(VF, VF);
      unsigned NumParts = divideCeil(VF, Sz);
      SmallBitVector UsedVals(NumParts);
      for (unsigned I = 0; I < VF; I += Sz) {
        int Val = PoisonMaskElem;
        unsigned UndefCnt = 0;
        unsigned Limit = std::min(Sz, VF - I);
        if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
                   [&](int Idx) { /* ... */ return false; }) ||
            Val == PoisonMaskElem || UndefCnt > Sz / 2 ||
            Val >= static_cast<int>(NumParts) || UsedVals.test(Val))
          return std::nullopt;
        UsedVals.set(Val);
        for (unsigned K = 0; K < NumParts; ++K) {
          unsigned Idx = Val + Sz * K;
          if (Idx < VF)
            ResOrder[Idx] = I + K;
        }
      }
      return std::move(ResOrder);
    }
    unsigned VF = TE.getVectorFactor();
    OrdersType ResOrder(VF, VF);
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    if (TE.getOpcode() == Instruction::ExtractElement &&
        all_of(TE.Scalars, [Sz](Value *V) {
          if (isa<PoisonValue>(V))
            return true;
          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
          return Idx && *Idx < Sz;
        })) {
      assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
                                   "by BinaryOperator and CastInst.");
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == PoisonMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
        Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
      }
    }
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
      if (SubMask.front() == PoisonMaskElem)
        std::iota(SubMask.begin(), SubMask.end(), 0);
      reorderOrder(CurrentOrder, SubMask);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
    }
    if (all_of(enumerate(ResOrder), [](const auto &Data) {
          return Data.index() == Data.value();
        }))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      any_of(TE.UserTreeIndices,
             [](const EdgeInfo &EI) {
               return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
             }) &&
      (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
    return std::nullopt;
  if ((TE.State == TreeEntry::Vectorize ||
       TE.State == TreeEntry::StridedVectorize) &&
      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
    assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
                                 "by BinaryOperator and CastInst.");
    return TE.ReorderIndices;
  }
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;

    SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
    for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
      if (!V->hasNUsesOrMore(1))
        continue;
      auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
      if (!II)
        continue;
      Instruction *BVHead = nullptr;
      BasicBlock *BB = II->getParent();
      while (II && II->hasOneUse() && II->getParent() == BB) {
        BVHead = II;
        II = dyn_cast<InsertElementInst>(II->getOperand(0));
      }
      I = BVHead;
    }

    auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
      assert(BB1 != BB2 && "Expected different basic blocks.");
      auto *NodeA = DT->getNode(BB1);
      auto *NodeB = DT->getNode(BB2);
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
    };
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
        return false;
      if (isa<PoisonValue>(V1))
        return true;
      if (isa<PoisonValue>(V2))
        return false;
      // ...
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
        return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
                                    FirstUserOfPhi2->getParent());
      auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
      auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
      auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
      auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
      if (IE1 && IE2) {
        if (UserBVHead[I1] && !UserBVHead[I2])
          return true;
        if (!UserBVHead[I1])
          return false;
        if (UserBVHead[I1] == UserBVHead[I2])
          return getElementIndex(IE1) < getElementIndex(IE2);
        if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
          return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
                                      UserBVHead[I2]->getParent());
        return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
      }
      if (EE1 && EE2) {
        auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
        auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
        auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
        auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
        // ...
        if (EE1->getOperand(0) == EE2->getOperand(0))
          return getElementIndex(EE1) < getElementIndex(EE2);
        if (!Inst1 && Inst2)
          return false;
        if (Inst1 && Inst2) {
          // ...
        }
        assert((P1 || P2) &&
               "Expected either instructions or arguments vector operands.");
        return P1->getArgNo() < P2->getArgNo();
      }
      return false;
    };
    OrdersType Phis(TE.Scalars.size());
    std::iota(Phis.begin(), Phis.end(), 0);
    stable_sort(Phis, PHICompare);
    if (isIdentityOrder(Phis))
      return std::nullopt; // No need to reorder.
    return std::move(Phis);
  }
  if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
    if ((TE.getOpcode() == Instruction::ExtractElement ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
        all_of(TE.Scalars, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        })) {
      // Check that the gather of extractelements can be represented as just a
      // shuffle of a single vector.
      OrdersType CurrentOrder;
      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
                                   /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // If the gather node is <undef, v, .., poison>, check whether
    // "insertelement poison, v, 0" plus a permute is cheaper than
    // "insertelement poison, v, n" - if so, try to reorder.
    int Sz = TE.Scalars.size();
    if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
        count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
      const auto *It =
          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
      if (It == TE.Scalars.begin())
        return OrdersType();
      if (It != TE.Scalars.end()) {
        OrdersType Order(Sz, Sz);
        unsigned Idx = std::distance(TE.Scalars.begin(), It);
        // ... (compare InsertFirstCost + PermuteCost vs InsertIdxCost)
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          Order[Idx] = 0;
          fixupOrderingIndices(Order);
          return std::move(Order);
        }
      }
    }
    if (isSplat(TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    // Check if we can include the order of vectorized loads. For masked
    // gathers do extra analysis.
    if (TE.isGather() && TE.getOpcode() == Instruction::Load) {
      SmallVector<Value *> PointerOps;
      OrdersType CurrentOrder;
      LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                         CurrentOrder, PointerOps);
      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
        return std::move(CurrentOrder);
    }
  }
  if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
    return CurrentOrder;
  return std::nullopt;
}
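// getReorderingData is the per-node half of the reordering machinery: it
// answers "if this entry were permuted, which order would make it cheapest?"
// for loads (consecutive order), extracts (source-vector order), phis (user
// order) and gathers, while reorderTopToBottom/reorderBottomToTop below
// decide which of those candidate orders actually wins tree-wide.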
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                               unsigned Sz) {
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
    return false;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef<int> Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
  return true;
}

void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder the reuses mask.
  reorderReuses(TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized and non-clustered reuses - no need to do anything else.
  if (!TE.isGather() ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                   Sz) ||
      !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
    return;
  SmallVector<int> NewMask;
  inversePermutation(TE.ReorderIndices, NewMask);
  addMask(NewMask, TE.ReuseShuffleIndices);
  reorderScalars(TE.Scalars, NewMask);
  TE.ReorderIndices.clear();
  // ...
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}
5841 "Expected same size of orders");
5842 unsigned Sz = Order.
size();
5844 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
5845 if (Order[
Idx] != Sz)
5846 UsedIndices.
set(Order[
Idx]);
5848 if (SecondaryOrder.
empty()) {
5849 for (
unsigned Idx : seq<unsigned>(0, Sz))
5850 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
5853 for (
unsigned Idx : seq<unsigned>(0, Sz))
5854 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
5855 !UsedIndices.
test(SecondaryOrder[
Idx]))
5856 Order[
Idx] = SecondaryOrder[
Idx];
void BoUpSLP::reorderTopToBottom() {
  // Maps VF to the graph nodes.
  DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
  // ExtractElement gather nodes which can be vectorized and need to handle
  // their ordering.
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
  // Phi nodes can have a preferred ordering based on their result users.
  DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
  // AltShuffle nodes can also have a preferred ordering.
  DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
  // Maps a TreeEntry to the reorder indices of its external users.
  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
      ExternalUserReorderMap;
  // Find all reorderable nodes with the given VF.
  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      ExternalUserReorderMap.try_emplace(
          TE.get(), std::move(ExternalUserReorderIndices));
    }

    // Patterns like [fadd,fsub] can be combined into a single instruction on
    // some targets; if so, consider the order of this node too.
    if (TE->isAltShuffle()) {
      // ...
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      // ...
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
        AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
      }
    }

    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/true)) {
      // Do not include orderings for nodes used in alt-opcode vectorization;
      // it is better to reorder them during the bottom-to-top stage.
      const TreeEntry *UserTE = TE.get();
      // ...
        if (UserTE->UserTreeIndices.size() != 1)
          // ...
        if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
              return EI.UserTE->State == TreeEntry::Vectorize &&
                     EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
            }))
          return;
        UserTE = UserTE->UserTreeIndices.back().UserTE;
      // ...
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });

  // Reorder the graph nodes according to their vectorization factor.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order: count the uses of each candidate
    // order and reorder the scalars in the nodes according to the most used
    // one.
    ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
    // ...
    MapVector<OrdersType, unsigned,
              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
        OrdersUses;
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder these nodes: we still need to extend and to use a
      // shuffle, just merge the reordering shuffle and the reuse shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      // Count the number of uses of each order.
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // If the vector factor differs from the number of scalars, use the
        // natural order: this is an attempt to reorder a node with reused
        // scalars but with external uses.
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
        }
        // No other useful reorder data in this entry.
        if (Order.empty())
          continue;
      }
      // Stores actually store the mask, not the order: need to invert.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(CurrentOrder);
        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
      } else {
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
        combineOrders(IdentityOrder, Pair.first);
      }
    }
    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // Prefer the identity order. But if a filled identity (non-empty order)
      // was found with the same number of uses as the new candidate order,
      // the candidate order can be chosen.
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           isIdentityOrder(BestOrder))) {
        BestOrder = Pair.first;
        Cnt = Pair.second;
      } else {
        combineOrders(BestOrder, Pair.first);
      }
    }
    // Set the order of the user node.
    if (isIdentityOrder(BestOrder))
      continue;
    fixupOrderingIndices(BestOrder);
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // Do the actual reordering.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // Only reorder the nodes with the given VF.
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          // Need to reorder the reuses masks of the operands with smaller VF
          // to be able to correctly process the shuffle requirement.
          assert(all_of(TE->UserTreeIndices,
                        [VF, &TE](const EdgeInfo &EI) {
                          return EI.UserTE->Scalars.size() == VF ||
                                 EI.UserTE->Scalars.size() ==
                                     TE->Scalars.size();
                        }) &&
                 "All users must be of VF size.");
          if (SLPReVec) {
            // ShuffleVectorInst does not do reorderOperands (and it should not
            // because it supports only a limited set of vector operations),
            // so check that all users are ShuffleVectorInsts.
            assert(all_of(TE->UserTreeIndices,
                          [&](const EdgeInfo &EI) {
                            return isa<ShuffleVectorInst>(
                                EI.UserTE->getMainOp());
                          }) &&
                   "Does not know how to reorder.");
          }
          // Update the ordering of the operands with the smaller VF.
          reorderNodeWithReuses(*TE, Mask);
        }
        continue;
      }
      if ((TE->State == TreeEntry::Vectorize ||
           TE->State == TreeEntry::StridedVectorize) &&
          (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
               InsertElementInst>(TE->getMainOp()) ||
           (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
        assert(!TE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // Build correct orders for extract{element,value}, loads and stores.
        reorderOrder(TE->ReorderIndices, Mask);
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        reorderScalars(TE->Scalars, Mask);
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply the reversed order to keep the original ordering of the
        // reused elements and avoid extra reorder-index shuffling.
        OrdersType CurrentOrder;
        reorderOrder(CurrentOrder, MaskOrder);
        SmallVector<int> NewReuses;
        inversePermutation(CurrentOrder, NewReuses);
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      }
    }
  }
}
bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    ArrayRef<TreeEntry *> ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize);
        }))
      continue;
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // Do not reorder if the operand node is used by many user nodes.
      if (any_of(TE->UserTreeIndices,
                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
        return false;
      // Add the node to the list of the ordered nodes with the identity
      // order.
      Edges.emplace_back(I, TE);
      // Add ScatterVectorize nodes to the list of operands, where just
      // reordering of the scalars is required.
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    TreeEntry *Gather = nullptr;
    if (count_if(ReorderableGathers,
                 [&Gather, UserTE, I](TreeEntry *TE) {
                   assert(TE->State != TreeEntry::Vectorize &&
                          TE->State != TreeEntry::StridedVectorize &&
                          "Only non-vectorized nodes are expected.");
                   if (any_of(TE->UserTreeIndices,
                              [UserTE, I](const EdgeInfo &EI) {
                                return EI.UserTE == UserTE && EI.EdgeIdx == I;
                              })) {
                     assert(TE->isSame(UserTE->getOperand(I)) &&
                            "Operand entry does not match operands.");
                     Gather = TE;
                     return true;
                   }
                   return false;
                 }) > 1 &&
        !allConstant(UserTE->getOperand(I)))
      return false;
    if (Gather)
      GatherOps.push_back(Gather);
  }
  return true;
}
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  SetVector<TreeEntry *> OrderedEntries;
  DenseSet<const TreeEntry *> GathersToOrders;
  // Find all reorderable leaf nodes.
  SmallVector<TreeEntry *> NonVectorized;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize)
      NonVectorized.push_back(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false)) {
      OrderedEntries.insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
    }
  }

  // Propagate the order to graph nodes that use only reordered nodes.
  SmallPtrSet<const TreeEntry *, 4> Visited;
  while (!OrderedEntries.empty()) {
    // 1. Filter out only the reordered nodes.
    // 2. If an entry has multiple distinct users - skip it for now.
    DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
    SmallVector<TreeEntry *> Filtered;
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          !all_of(drop_begin(TE->UserTreeIndices),
                  [TE](const EdgeInfo &EI) {
                    return EI.UserTE == TE->UserTreeIndices.front().UserTE;
                  }) ||
          !Visited.insert(TE).second) {
        Filtered.push_back(TE);
        continue;
      }
      // Build a map between user nodes and their operand orders to speed up
      // the search.
      for (EdgeInfo &EI : TE->UserTreeIndices)
        Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
    }
    // Erase the filtered entries so they do not take part in the following
    // iterations.
    for (TreeEntry *TE : Filtered)
      OrderedEntries.remove(TE);
    SmallVector<
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
        UsersVec(Users.begin(), Users.end());
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    });
    for (auto &Data : UsersVec) {
      // Check that the operands are used only in this user node.
      SmallVector<TreeEntry *> GatherOps;
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              GatherOps)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // All operands are reordered and used only in this node - propagate the
      // most used order to the user node.
      MapVector<OrdersType, unsigned,
                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
          OrdersUses;
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
          continue;
        const auto Order = [&]() -> const OrdersType {
          if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
            return getReorderingData(*OpTE, /*TopToBottom=*/false)
                .value_or(OrdersType(1));
          return OpTE->ReorderIndices;
        }();
        // A partially ordered entry is skipped in favor of fully ordered
        // ones.
        if (Order.size() == 1)
          continue;
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask, not the order: need to invert.
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          assert(!OpTE->isAltShuffle() &&
                 "Alternate instructions are only supported by BinaryOperator "
                 "and CastInst.");
          SmallVector<int> Mask;
          inversePermutation(Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(CurrentOrder);
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
              NumOps;
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        }
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto AllowsReordering = [&](const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->isGather()) {
            // ...
            return true;
          }
          return false;
        };
        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
          TreeEntry *UserTE = EI.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            continue;
          // May reorder the user node if it requires reordering, has reused
          // scalars, is an alternate-op node or its operands require
          // reordering.
          if (AllowsReordering(UserTE))
            continue;
          // Only look one level up to avoid increasing compile time.
          // Profitable to reorder only if definitely more operands allow
          // reordering than those with a natural order.
          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           all_of(Op.second->UserTreeIndices,
                                  [UserTE](const EdgeInfo &EI) {
                                    return EI.UserTE == UserTE;
                                  });
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
      if (OrdersUses.empty()) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // Choose the most used order.
      unsigned IdentityCnt = 0;
      unsigned VF = Data.second.front().second->getVectorFactor();
      OrdersType IdentityOrder(VF, VF);
      for (auto &Pair : OrdersUses) {
        if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
          IdentityCnt += Pair.second;
          combineOrders(IdentityOrder, Pair.first);
        }
      }
      MutableArrayRef<unsigned> BestOrder = IdentityOrder;
      unsigned Cnt = IdentityCnt;
      for (auto &Pair : OrdersUses) {
        if (Cnt < Pair.second) {
          combineOrders(Pair.first, BestOrder);
          BestOrder = Pair.first;
          Cnt = Pair.second;
        } else {
          combineOrders(BestOrder, Pair.first);
        }
      }
      // Set the order of the user node.
      if (isIdentityOrder(BestOrder)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      fixupOrderingIndices(BestOrder);
      // Erase operands from OrderedEntries and adjust their orders.
      VisitedOps.clear();
      SmallVector<int> Mask;
      inversePermutation(BestOrder, Mask);
      SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
      unsigned E = BestOrder.size();
      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        OrderedEntries.remove(TE);
        if (!VisitedOps.insert(TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::StridedVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(TE->ReorderIndices, Mask);
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers just reorder the scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // Just reorder the reuses indices.
          reorderReuses(Gather->ReuseShuffleIndices, Mask);
          continue;
        }
        reorderScalars(Gather->Scalars, Mask);
        OrderedEntries.remove(Gather);
      }
      // Reorder the operands of the user node and set the ordering for the
      // user node itself.
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Data.first->getMainOp()) ||
          Data.first->isAltShuffle())
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle() ||
          Data.first->State == TreeEntry::StridedVectorize) {
        reorderScalars(Data.first->Scalars, Mask);
        reorderOrder(Data.first->ReorderIndices, MaskOrder,
                     /*BottomOrder=*/true);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !Data.first->isAltShuffle()) {
          // Insert the user node into the list to try to sink the reordering
          // deeper in the graph.
          OrderedEntries.insert(Data.first);
        }
      } else {
        reorderOrder(Data.first->ReorderIndices, Mask);
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
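// Bottom-to-top reordering sinks a profitable order from operands toward
// users: a user adopts an order only when canReorderOperands proves that all
// of its operand nodes are exclusively owned by it, and the user is then
// re-inserted into the worklist so the order can keep bubbling upward.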
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if ((Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather())
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
        continue;
      // All uses replaced already? No need to do it again.
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;

      // Check if the scalar is externally used as an extra arg.
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
        continue;
      }
      for (User *U : Scalar->users()) {
        // ...
        // Ignore users in the user-ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;

        // Skip in-tree scalars that become vector elements.
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          // Some in-tree scalars will remain as scalars in vectorized
          // instructions. If that is the case, the one in FoundLane will
          // be used.
          if (UseEntry->State == TreeEntry::ScatterVectorize ||
              !doesInTreeUserNeedToExtract(
                  Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(!UseEntry->isGather() && "Bad state");
            continue;
          }
          U = nullptr;
          if (It != ScalarToExtUses.end()) {
            ExternalUses[It->second].User = nullptr;
            break;
          }
        }

        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          U = nullptr;
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, FoundLane);
        if (!U)
          break;
      }
    }
  }
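// ExternalUses records (scalar, user, lane) triples so that, after
// vectorization, an extractelement can be emitted for every scalar still
// needed outside the tree; a null user acts as a wildcard, meaning "extract
// for any external use" (also used when the use count exceeds UsesLimit).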
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
                SmallVector<StoreInst *>>
      PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (!isa<Instruction>(V))
      continue;
    // ...
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      // If this store is in a different function or basic block, or it stores
      // a different type, skip it.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // Skip the entry if it is already part of the tree.
      if (getTreeEntry(U))
        continue;

      Value *Ptr =
          getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(),
                                        Ptr}];
      // For now, just add the store to the first lane that uses it.
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        std::optional<int> Diff = getPointersDiff(
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
            /*StrictCheck=*/true);
        // We failed to compare the pointers so just abandon this group.
        if (!Diff)
          continue;
      }
      StoresVec.push_back(SI);
    }
  }
  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
  unsigned I = 0;
  for (auto &P : PtrToStoresMap) {
    Res[I].swap(P.second);
    ++I;
  }
  return Res;
}

bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // Check whether the stores in StoresVec can form a vector by computing
  // their offsets from the first store and checking that, once sorted, the
  // offsets are consecutive.
  SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int> Diff = getPointersDiff(
        SI->getValueOperand()->getType(), SI->getPointerOperand(),
        StoresVec.front()->getValueOperand()->getType(),
        StoresVec.front()->getPointerOperand(), *DL, *SE,
        /*StrictCheck=*/true);
    if (!Diff)
      return false;
    StoreOffsetVec.emplace_back(*Diff, Idx);
  }
  if (StoreOffsetVec.size() != StoresVec.size() - 1)
    return false;
  sort(StoreOffsetVec,
       [](const std::pair<int, unsigned> &L,
          const std::pair<int, unsigned> &R) { return L.first < R.first; });
  int PrevDist = 0;
  for (const auto &[Idx, P] : enumerate(StoreOffsetVec)) {
    if (Idx > 0 && P.first != PrevDist + 1)
      return false;
    PrevDist = P.first;
  }

  // Calculate the shuffle indices according to the offsets in the sorted
  // StoreOffsetVec.
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
  for (auto [I, P] : enumerate(StoreOffsetVec)) {
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  }
  // An identity order (e.g. {0,1,2,3}) is marked as an empty order, because
  // that is the default.
  if (IsIdentity)
    ReorderIndices.clear();
  return true;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif

SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();

  SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);

  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  SmallVector<OrdersType, 1> ExternalReorderIndices;

  // Inspect the stores collected per pointer and look for vectorization
  // candidates.
  for (ArrayRef<StoreInst *> StoresVec : Stores) {
    // With fewer than NumLanes stores we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;

    // If the stores are not consecutive, abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;

    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}
static void gatherPossiblyVectorizableLoads(
    const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
    ScalarEvolution &SE, const TargetTransformInfo &TTI,
    SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
    bool AddNew = true) {
  if (VL.empty())
    return;
  // ...
  SmallVector<SmallVector<std::pair<LoadInst *, int>>> ClusteredLoads;
  SmallVector<SmallDenseMap<int, LoadInst *>> ClusteredDistToLoad;
  for (Value *V : VL) {
    auto *LI = dyn_cast<LoadInst>(V);
    if (!LI)
      continue;
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
      continue;
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
      std::optional<int> Dist = getPointersDiff(
          LI->getType(), LI->getPointerOperand(),
          Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
          /*StrictCheck=*/true);
      if (!Dist)
        continue;
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
        continue;
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
      }
      IsFound = true;
      break;
    }
    if (!IsFound) {
      ClusteredLoads.emplace_back().emplace_back(LI, 0);
      ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
    }
  }
  auto FindMatchingLoads =
      [&](ArrayRef<std::pair<LoadInst *, int>> Loads,
          SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>>
              &GatheredLoads,
          SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
          int &Offset, unsigned &Start) {
        if (Loads.empty())
          return GatheredLoads.end();
        // ...
        LoadInst *LI = Loads.front().first;
        for (auto [Idx, Data] : enumerate(GatheredLoads)) {
          if (Idx < Start)
            continue;
          ToAdd.clear();
          if (LI->getParent() != Data.front().first->getParent() ||
              LI->getType() != Data.front().first->getType())
            continue;
          std::optional<int> Dist =
              getPointersDiff(LI->getType(), LI->getPointerOperand(),
                              Data.front().first->getType(),
                              Data.front().first->getPointerOperand(), DL, SE,
                              /*StrictCheck=*/true);
          if (!Dist)
            continue;
          SmallSet<int, 4> DataDists;
          SmallPtrSet<LoadInst *, 4> DataLoads;
          for (std::pair<LoadInst *, int> P : Data) {
            DataDists.insert(P.second);
            DataLoads.insert(P.first);
          }
          // Found matching gathered loads - check if all loads are unique or
          // can be effectively vectorized.
          unsigned NumUniques = 0;
          for (auto [Cnt, Pair] : enumerate(Loads)) {
            bool Used = DataLoads.contains(Pair.first);
            if (!Used && !DataDists.contains(*Dist + Pair.second)) {
              ++NumUniques;
              ToAdd.insert(Cnt);
            } else if (Used) {
              Repeated.insert(Cnt);
            }
          }
          if (NumUniques > 0 &&
              (Loads.size() == NumUniques ||
               (Loads.size() - NumUniques >= 2 &&
                Loads.size() - NumUniques >= Loads.size() / 2 &&
                /* ... */ true))) {
            Offset = *Dist;
            Start = Idx + 1;
            return std::next(GatheredLoads.begin(), Idx);
          }
        }
        ToAdd.clear();
        return GatheredLoads.end();
      };
  for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
    unsigned Start = 0;
    SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
    int Offset = 0;
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                                 Offset, Start);
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
        It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
      ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
                             Start);
    }
    if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
          return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        })) {
      auto AddNewLoads =
          [&](SmallVectorImpl<std::pair<LoadInst *, int>> &Loads) {
            for (unsigned Idx : seq<unsigned>(Data.size())) {
              if (ToAdd.contains(Idx) || Repeated.contains(Idx))
                continue;
              Loads.push_back(Data[Idx]);
            }
          };
      if (!AddNew) {
        LoadInst *LI = Data.front().first;
        auto *It = find_if(
            GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
              return PD.front().first->getParent() == LI->getParent() &&
                     PD.front().first->getType() == LI->getType();
            });
        while (It != GatheredLoads.end()) {
          AddNewLoads(*It);
          It = std::find_if(
              std::next(It), GatheredLoads.end(),
              [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
                return PD.front().first->getParent() == LI->getParent() &&
                       PD.front().first->getType() == LI->getType();
              });
        }
        // ...
        GatheredLoads.emplace_back().append(Data.begin(), Data.end());
        continue;
      }
      AddNewLoads(GatheredLoads.emplace_back());
    }
  }
}
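// This clusters loads by (parent block, type, provable pointer distance):
// each cluster keeps (LoadInst, Dist) pairs relative to its first load, and
// FindMatchingLoads merges a new cluster into an existing gathered group when
// their distance sets line up, so repeated loads are deduplicated instead of
// producing overlapping groups.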
void BoUpSLP::tryToVectorizeGatheredLoads(
    const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                         SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
                         8> &GatheredLoads) {
  GatheredLoadsEntriesFirst = VectorizableTree.size();

  SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
      LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert(VectorizableTree[Idx]->Scalars.begin(),
               VectorizableTree[Idx]->Scalars.end());

  // Sort loads by distance.
  auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
                       const std::pair<LoadInst *, int> &L2) {
    return L1.second > L2.second;
  };

  auto IsMaskedGatherSupported = [&](ArrayRef<LoadInst *> Loads) {
    ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
                             Loads.size());
    Align Alignment = computeCommonAlignment<LoadInst>(Values);
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    return TTI->isLegalMaskedGather(Ty, Alignment) &&
           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
  };

  auto GetVectorizedRanges =
      [this](ArrayRef<LoadInst *> Loads,
             SmallPtrSetImpl<LoadInst *> &VectorizedLoads,
             SmallVectorImpl<LoadInst *> &NonVectorized, bool Final,
             unsigned MaxVF) {
        SmallVector<std::pair<ArrayRef<Value *>, LoadsState>, 4> Results;
        unsigned StartIdx = 0;
        SmallVector<int> CandidateVFs;
        // Build the list of candidate vector factors down from MaxVF.
        for (int NumElts = getFloorFullVectorNumberOfElements(
                 *TTI, Loads.front()->getType(), MaxVF);
             NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
                              *TTI, Loads.front()->getType(), NumElts - 1)) {
          CandidateVFs.push_back(NumElts);
          // ...
        }
        if (Final && CandidateVFs.empty())
          return Results;
        unsigned BestVF = Final ? CandidateVFs.back() : 0;
        for (unsigned NumElts : CandidateVFs) {
          if (Final && NumElts > BestVF)
            continue;
          SmallVector<unsigned> MaskedGatherVectorized;
          for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; ++Cnt) {
            ArrayRef<LoadInst *> Slice =
                ArrayRef(Loads).slice(Cnt, std::min<unsigned>(NumElts, E - Cnt));
            if (VectorizedLoads.count(Slice.front()) ||
                VectorizedLoads.count(Slice.back()) ||
                areKnownNonVectorizableLoads(Slice))
              continue;
            // Vectorizing gathered loads pays off if there are more than 2
            // consecutive loads, or if there are fewer but all their users are
            // vectorized or deleted.
            bool AllowToVectorize = false;
            if (NumElts == 2) {
              bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
                  Slice.front()->getType(), ElementCount::getFixed(NumElts));
              auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
                for (LoadInst *LI : Slice) {
                  // A single use/user - allow vectorization.
                  if (LI->hasOneUse())
                    continue;
                  // 1. Check that the number of uses equals the number of
                  //    users.
                  // 2. All users must be deleted, or
                  // 3. broadcast loads are either not legal or not used as
                  //    broadcasts.
                  if (static_cast<unsigned int>(
                          std::distance(LI->user_begin(), LI->user_end())) !=
                      LI->getNumUses())
                    return false;
                  if (!IsLegalBroadcastLoad)
                    continue;
                  // ...
                  for (User *U : LI->users()) {
                    if (auto *UI = dyn_cast<Instruction>(U);
                        UI && isDeleted(UI))
                      continue;
                    if (const TreeEntry *UTE = getTreeEntry(U)) {
                      for (int I : seq<int>(UTE->getNumOperands())) {
                        if (all_of(UTE->getOperand(I),
                                   [LI](Value *V) { return V == LI; }))
                          // Found a legal broadcast - do not vectorize.
                          return false;
                      }
                    }
                  }
                }
                return true;
              };
              AllowToVectorize = CheckIfAllowed(Slice);
            } else {
              AllowToVectorize =
                  (NumElts >= 3 ||
                   any_of(ValueToGatherNodes.at(Slice.front()),
                          [=](const TreeEntry *TE) {
                            return TE->Scalars.size() == 2 &&
                                   ((TE->Scalars.front() == Slice.front() &&
                                     TE->Scalars.back() == Slice.back()) ||
                                    (TE->Scalars.front() == Slice.back() &&
                                     TE->Scalars.back() == Slice.front()));
                          })) &&
                  hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
                                           Slice.size());
            }
            if (AllowToVectorize) {
              SmallVector<Value *> PointerOps;
              OrdersType CurrentOrder;
              // Try to build a vector load.
              ArrayRef<Value *> Values(
                  reinterpret_cast<Value *const *>(Slice.begin()),
                  Slice.size());
              LoadsState LS = canVectorizeLoads(Values, Slice.front(),
                                                CurrentOrder, PointerOps,
                                                &BestVF);
              if (LS != LoadsState::Gather ||
                  (BestVF > 1 &&
                   static_cast<unsigned>(NumElts) == 2 * BestVF)) {
                if (LS == LoadsState::ScatterVectorize) {
                  if (MaskedGatherVectorized.empty() ||
                      Cnt >= MaskedGatherVectorized.back() + NumElts)
                    MaskedGatherVectorized.push_back(Cnt);
                  continue;
                }
                if (LS != LoadsState::Gather) {
                  Results.emplace_back(Values, LS);
                  VectorizedLoads.insert(Slice.begin(), Slice.end());
                  // If we vectorized the initial block, no need to try to
                  // vectorize it again.
                  if (Cnt == StartIdx)
                    StartIdx += NumElts;
                }
                // Check if the whole array was vectorized already - exit.
                if (StartIdx >= Loads.size())
                  break;
                // Erase the last masked gather candidate if a better candidate
                // within the range was found.
                if (!MaskedGatherVectorized.empty() &&
                    Cnt < MaskedGatherVectorized.back() + NumElts)
                  MaskedGatherVectorized.pop_back();
                continue;
              }
            }
            if (!AllowToVectorize || BestVF == 0)
              registerNonVectorizableLoads(Slice);
          }
          // Mark masked gather candidates as vectorized, if any.
          for (unsigned Cnt : MaskedGatherVectorized) {
            ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
                Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
            ArrayRef<Value *> Values(
                reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
            Results.emplace_back(Values, LoadsState::ScatterVectorize);
            VectorizedLoads.insert(Slice.begin(), Slice.end());
            // If we vectorized the initial block, no need to try to vectorize
            // it again.
            if (Cnt == StartIdx)
              StartIdx += NumElts;
          }
        }
        for (LoadInst *LI : Loads) {
          if (!VectorizedLoads.contains(LI))
            NonVectorized.push_back(LI);
        }
        return Results;
      };
  auto ProcessGatheredLoads =
      [&, &TTI = *TTI](
          ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
          bool Final = false) {
        SmallVector<LoadInst *> NonVectorized;
        for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
            continue;
          }
          SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
          SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
          transform(LoadsDists, OriginalLoads.begin(),
                    [](const std::pair<LoadInst *, int> &L) {
                      return L.first;
                    });
          stable_sort(LocalLoadsDists, LoadSorter);
          SmallVector<LoadInst *> Loads;
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
            if (getTreeEntry(L.first))
              continue;
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<unsigned>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
              Loads.push_back(L.first);
              continue;
            }
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
                !Loads.empty())
              Loads.pop_back();
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
            Loads.push_back(L.first);
          }
          if (Loads.size() <= 1)
            continue;
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
            continue;
          BoUpSLP::ValueSet VectorizedLoads;
          SmallVector<LoadInst *> SortedNonVectorized;
          SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                                  Final, MaxConsecutiveDistance);
          if (!Results.empty() && !SortedNonVectorized.empty() &&
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size() &&
              all_of(Results, [](const std::pair<ArrayRef<Value *>, LoadsState>
                                     &P) {
                return P.second == LoadsState::ScatterVectorize;
              })) {
            VectorizedLoads.clear();
            SmallVector<LoadInst *> UnsortedNonVectorized;
            SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
                UnsortedResults =
                    GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                        UnsortedNonVectorized, Final,
                                        OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
            }
          }
          for (auto [Slice, _] : Results) {
            LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
                              << Slice.size() << ")\n");
            if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) {
              for (Value *L : Slice)
                if (!getTreeEntry(L))
                  SortedNonVectorized.push_back(cast<LoadInst>(L));
              continue;
            }
            // Select the maximum VF as the maximum of the user gathered
            // nodes and the distance between scalar loads in these nodes.
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
            if (MaxVF == 2) {
              UserMaxVF = MaxVF;
            } else {
              // Found distance between segments of the interleaved loads.
              std::optional<unsigned> InterleavedLoadsDistance = 0;
              unsigned Order = 0;
              std::optional<unsigned> CommonVF = 0;
              DenseMap<const TreeEntry *, unsigned> EntryToPosition;
              SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
              for (auto [Idx, V] : enumerate(Slice)) {
                for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                  UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
                  unsigned Pos =
                      EntryToPosition.try_emplace(E, Idx).first->second;
                  UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                  if (CommonVF) {
                    if (*CommonVF == 0) {
                      CommonVF = E->Scalars.size();
                      continue;
                    }
                    if (*CommonVF != E->Scalars.size())
                      CommonVF.reset();
                  }
                  // Check if the load is part of an interleaved load.
                  if (Pos != Idx && InterleavedLoadsDistance) {
                    if (!DeinterleavedNodes.contains(E) &&
                        any_of(E->Scalars, [&, Slice = Slice](Value *V) {
                          if (isa<Constant>(V))
                            return false;
                          if (getTreeEntry(V))
                            return true;
                          const auto &Nodes = ValueToGatherNodes.at(V);
                          return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                                 !is_contained(Slice, V);
                        })) {
                      InterleavedLoadsDistance.reset();
                      continue;
                    }
                    DeinterleavedNodes.insert(E);
                    if (*InterleavedLoadsDistance == 0) {
                      InterleavedLoadsDistance = Idx - Pos;
                      continue;
                    }
                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
                      InterleavedLoadsDistance.reset();
                    Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
                  }
                }
              }
              DeinterleavedNodes.clear();
              // Check if the large load represents an interleaved load
              // operation.
              if (InterleavedLoadsDistance.value_or(0) > 1 &&
                  CommonVF.value_or(0) != 0) {
                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
                unsigned VF = *CommonVF;
                // Segmented load detected - vectorize at the maximum vector
                // factor.
                if (InterleaveFactor <= Slice.size() &&
                    TTI.isLegalInterleavedAccessType(
                        getWidenedType(Slice.front()->getType(), VF),
                        InterleaveFactor,
                        cast<LoadInst>(Slice.front())->getAlign(),
                        cast<LoadInst>(Slice.front())
                            ->getPointerAddressSpace())) {
                  UserMaxVF = InterleaveFactor * VF;
                } else {
                  InterleaveFactor = 0;
                }
              }
              // If the loads cannot be represented as consecutive
              // vectorizable nodes - just exit.
              unsigned ConsecutiveNodesSize = 0;
              if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                  any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                         [&, Slice = Slice](const auto &P) {
                           const auto *It = find_if(Slice, [&](Value *V) {
                             return std::get<1>(P).contains(V);
                           });
                           if (It == Slice.end())
                             return false;
                           ArrayRef<Value *> VL =
                               VectorizableTree[std::get<0>(P)]->Scalars;
                           ConsecutiveNodesSize += VL.size();
                           unsigned Start = std::distance(Slice.begin(), It);
                           unsigned Sz = Slice.size() - Start;
                           return Sz < VL.size() ||
                                  Slice.slice(std::distance(Slice.begin(), It),
                                              VL.size()) != VL;
                         }))
                continue;
              // Try to build long masked gather loads.
              UserMaxVF = bit_ceil(UserMaxVF);
              if (InterleaveFactor == 0 &&
                  any_of(seq<unsigned>(Slice.size() / UserMaxVF),
                         [&, Slice = Slice](unsigned Idx) {
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           return canVectorizeLoads(
                                      Slice.slice(Idx * UserMaxVF, UserMaxVF),
                                      Slice[Idx * UserMaxVF], Order,
                                      PointerOps) ==
                                  LoadsState::ScatterVectorize;
                         }))
                UserMaxVF = MaxVF;
              if (Slice.size() != ConsecutiveNodesSize)
                MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            }
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                ArrayRef<Value *> SubSlice =
                    Slice.slice(I, std::min(VF, E - I));
                if (getTreeEntry(SubSlice.front()))
                  continue;
                // Check if the subslice is a to-be-vectorized entry that is
                // not equal to this entry.
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                             return !SubSlice.equals(
                                        VectorizableTree[std::get<0>(P)]
                                            ->Scalars) &&
                                    set_is_subset(SubSlice, std::get<1>(P));
                           }))
                  continue;
                unsigned Sz = VectorizableTree.size();
                buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                  // Try non-interleaved vectorization with a smaller vector
                  // factor.
                  if (InterleaveFactor > 0) {
                    VF = 2 * (MaxVF / InterleaveFactor);
                    InterleaveFactor = 0;
                  }
                  continue;
                }
              }
              if (IsVectorized)
                break;
            }
          }
          NonVectorized.append(SortedNonVectorized);
        }
        return NonVectorized;
      };
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
    if (!Ref.empty() && !NonVectorized.empty() &&
        std::accumulate(
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S, ArrayRef<std::pair<LoadInst *, int>> LoadsDists) {
              return S + LoadsDists.size();
            }) != NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
      for (LoadInst *LI : NonVectorized) {
        // Reinsert non-vectorized loads into another list of loads with the
        // same base pointers.
        gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
                                        FinalGatheredLoads,
                                        /*AddNew=*/false);
      }
      // Final attempt to vectorize the non-vectorized loads.
      (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
    }
  }
  // Try to vectorize postponed load entries, previously marked as gathered.
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
    // Avoid reordering, if possible.
    if (!E.ReorderIndices.empty()) {
      // Build a mask out of the reorder indices and reorder the scalars.
      SmallVector<int> ReorderMask;
      inversePermutation(E.ReorderIndices, ReorderMask);
      reorderScalars(GatheredScalars, ReorderMask);
    }
    buildTree_rec(GatheredScalars, 0, EdgeInfo());
  }
  // If no new entries were created, no gathered-loads entries need handling.
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
}
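// The driver above works outside-in: whole clusters are first vectorized at
// the largest feasible VF (preferring consecutive, strided or interleaved
// forms), leftovers are re-clustered and retried with masked gathers allowed
// (Final=true), and only then are the postponed gather entries rebuilt.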
static Value *needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!NeedsScheduling) {
      NeedsScheduling = V;
      continue;
    }
    return nullptr;
  }
  return NeedsScheduling;
}
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI,
    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
    bool AllowAlternate) {
  hash_code Key = hash_value(V->getValueID() + 2);
  hash_code SubKey = hash_value(0);
  // Sort the loads by the distance between the pointers.
  if (auto *LI = dyn_cast<LoadInst>(V)) {
    Key = hash_combine(LI->isSimple(), hash_value(Instruction::Load), Key);
    if (LI->isSimple())
      SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
    else
      Key = SubKey = hash_value(LI);
  } else if (isVectorLikeInstWithConstOps(V)) {
    // Sort extracts by the vector operands.
    if (isa<ExtractElementInst, UndefValue>(V))
      Key = hash_value(Value::UndefValueVal + 1);
    if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
      if (!isUndefVector(EI->getVectorOperand()).all() &&
          !isa<UndefValue>(EI->getIndexOperand()))
        SubKey = hash_value(EI->getVectorOperand());
    }
  } else if (auto *I = dyn_cast<Instruction>(V)) {
    // Sort other instructions just by the opcode, except for CmpInst, which
    // is also sorted by the predicate kind.
    if ((isa<BinaryOperator, CastInst>(I)) &&
        isValidForAlternation(I->getOpcode())) {
      // ...
      SubKey = hash_combine(
          hash_value(I->getOpcode()), hash_value(I->getType()),
          hash_value(isa<BinaryOperator>(I)
                         ? I->getType()
                         : cast<CastInst>(I)->getOperand(0)->getType()));
      // For casts, look through the only operand to improve compile time.
      if (isa<CastInst>(I)) {
        std::pair<size_t, size_t> OpVals =
            generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
                              /*AllowAlternate=*/true);
        Key = hash_combine(OpVals.first, Key);
        SubKey = hash_combine(OpVals.first, SubKey);
      }
    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
      CmpInst::Predicate Pred = CI->getPredicate();
      if (CI->isCommutative())
        Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
      // ... (hash the opcode, predicate pair and operand type)
    } else if (auto *Call = dyn_cast<CallInst>(I)) {
      // ... (hash the intrinsic ID or the called function, plus any
      //      operand bundles)
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
        SubKey = hash_value(Gep->getPointerOperand());
      else
        SubKey = hash_value(Gep);
    } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
               !isa<ConstantInt>(I->getOperand(1))) {
      // Do not try to vectorize instructions with a potentially high cost.
      SubKey = hash_value(I);
    } else {
      SubKey = hash_value(I->getOpcode());
    }
    Key = hash_combine(hash_value(I->getParent()), Key);
  }
  return std::make_pair(Key, SubKey);
}
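// (Key, SubKey) is a two-level grouping hash used when splitting a gather
// into vectorizable subsets: Key buckets values coarsely (kind of operation,
// parent block), while SubKey distinguishes finer shapes such as load
// pointer clusters, compare predicates, or GEP base pointers.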
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       ArrayRef<Value *> VL) const {
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
  // If this pattern is supported by the target then consider it profitable.
  if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
                           Opcode0, Opcode1, OpcodeMask))
    return true;
  SmallVector<ValueList> Operands;
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    Operands.emplace_back();
    // Prepare the operand vector.
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        Operands.back().push_back(
            PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
        continue;
      }
      Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
    }
  }
  if (Operands.size() == 2) {
    // Try to find the best operand candidates.
    for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
      SmallVector<std::pair<Value *, Value *>> Candidates(3);
      Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
      Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
      Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
      std::optional<int> Res = findBestRootPair(Candidates);
      switch (Res.value_or(0)) {
      case 0:
        break;
      case 1:
        std::swap(Operands[0][I + 1], Operands[1][I + 1]);
        break;
      case 2:
        std::swap(Operands[0][I], Operands[1][I]);
        break;
      default:
        llvm_unreachable("Unexpected index.");
      }
    }
  }
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
  unsigned NonInstCnt = 0;
  // Estimate the number of instructions required for the vectorized node and
  // for the buildvector node.
  unsigned UndefCnt = 0;
  // Count the number of extra shuffles required for the vector nodes.
  unsigned ExtraShuffleInsts = 0;
  // Check that the operands do not contain the same values, forming either a
  // perfect diamond match or a shuffled match.
  if (Operands.size() == 2) {
    // Do not count the same operands twice.
    if (Operands.front() == Operands.back()) {
      Operands.erase(Operands.begin());
    } else if (!allConstant(Operands.front()) &&
               all_of(Operands.front(), [&](Value *V) {
                 return is_contained(Operands.back(), V);
               })) {
      Operands.erase(Operands.begin());
      ++ExtraShuffleInsts;
    }
  }
  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
  // Vectorize the node if:
  // 1. at least a single operand is constant or splat, or
  // 2. the operands have many loop invariants (the instructions themselves
  //    are not loop invariant), or
  // 3. at least a single unique operand is supposed to be vectorized.
  return none_of(Operands,
                 [&](ArrayRef<Value *> Op) {
                   if (allConstant(Op) || isSplat(Op))
                     return false;
                   // ...
                   DenseMap<Value *, unsigned> Uniques;
                   for (Value *V : Op) {
                     if (isa<Constant, ExtractElementInst>(V) ||
                         getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
                       if (isa<UndefValue>(V))
                         ++UndefCnt;
                       continue;
                     }
                     auto Res = Uniques.try_emplace(V, 0);
                     // Found the first duplicate - need to add a shuffle.
                     if (!Res.second && Res.first->second == 1)
                       ++ExtraShuffleInsts;
                     ++Res.first->getSecond();
                     if (auto *I = dyn_cast<Instruction>(V))
                       UniqueOpcodes.insert(I->getOpcode());
                     else if (Res.second)
                       ++NonInstCnt;
                   }
                   return none_of(Uniques, [&](const auto &P) {
                     return P.first->hasNUsesOrMore(P.second + 1) &&
                            none_of(P.first->users(), [&](User *U) {
                              return getTreeEntry(U) || Uniques.contains(U);
                            });
                   });
                 }) ||
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}
7540BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7542 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7545 "Expected instructions with same/alternate opcodes only.");
7547 unsigned ShuffleOrOp =
7548 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
7550 switch (ShuffleOrOp) {
7551 case Instruction::PHI: {
7554 return TreeEntry::NeedToGather;
7556 for (
Value *V : VL) {
7557 auto *
PHI = dyn_cast<PHINode>(V);
7562 if (Term &&
Term->isTerminator()) {
7564 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
7565 return TreeEntry::NeedToGather;
7570 return TreeEntry::Vectorize;
7572 case Instruction::ExtractValue:
7573 case Instruction::ExtractElement: {
7574 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
7577 return TreeEntry::NeedToGather;
7578 if (Reuse || !CurrentOrder.empty())
7579 return TreeEntry::Vectorize;
7581 return TreeEntry::NeedToGather;
7583 case Instruction::InsertElement: {
7587 for (
Value *V : VL) {
7588 SourceVectors.
insert(cast<Instruction>(V)->getOperand(0));
7590 "Non-constant or undef index?");
7594 return !SourceVectors.contains(V);
7597 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
7598 "different source vectors.\n");
7599 return TreeEntry::NeedToGather;
7604 return SourceVectors.contains(V) && !
V->hasOneUse();
7607 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
7608 "multiple uses.\n");
7609 return TreeEntry::NeedToGather;
7612 return TreeEntry::Vectorize;
7614 case Instruction::Load: {
7623 return TreeEntry::Vectorize;
7625 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7627 LoadEntriesToVectorize.insert(VectorizableTree.size());
7628 return TreeEntry::NeedToGather;
7630 return TreeEntry::ScatterVectorize;
7632 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7634 LoadEntriesToVectorize.insert(VectorizableTree.size());
7635 return TreeEntry::NeedToGather;
7637 return TreeEntry::StridedVectorize;
7641 if (
DL->getTypeSizeInBits(ScalarTy) !=
7642 DL->getTypeAllocSizeInBits(ScalarTy))
7643 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
7645 auto *LI = dyn_cast<LoadInst>(V);
7646 return !LI || !LI->isSimple();
7653 return TreeEntry::NeedToGather;
7657 case Instruction::ZExt:
7658 case Instruction::SExt:
7659 case Instruction::FPToUI:
7660 case Instruction::FPToSI:
7661 case Instruction::FPExt:
7662 case Instruction::PtrToInt:
7663 case Instruction::IntToPtr:
7664 case Instruction::SIToFP:
7665 case Instruction::UIToFP:
7666 case Instruction::Trunc:
7667 case Instruction::FPTrunc:
7668 case Instruction::BitCast: {
7670 for (
Value *V : VL) {
7671 if (isa<PoisonValue>(V))
7673 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7676 dbgs() <<
"SLP: Gathering casts with different src types.\n");
7677 return TreeEntry::NeedToGather;
7680 return TreeEntry::Vectorize;
7682 case Instruction::ICmp:
7683 case Instruction::FCmp: {
7688 for (
Value *V : VL) {
7689 if (isa<PoisonValue>(V))
7691 auto *
Cmp = cast<CmpInst>(V);
7692 if ((
Cmp->getPredicate() != P0 &&
Cmp->getPredicate() != SwapP0) ||
7693 Cmp->getOperand(0)->getType() != ComparedTy) {
7694 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering cmp with different predicate.\n");
7695 return TreeEntry::NeedToGather;
7698 return TreeEntry::Vectorize;
7700 case Instruction::Select:
7701 case Instruction::FNeg:
7702 case Instruction::Add:
7703 case Instruction::FAdd:
7704 case Instruction::Sub:
7705 case Instruction::FSub:
7706 case Instruction::Mul:
7707 case Instruction::FMul:
7708 case Instruction::UDiv:
7709 case Instruction::SDiv:
7710 case Instruction::FDiv:
7711 case Instruction::URem:
7712 case Instruction::SRem:
7713 case Instruction::FRem:
7714 case Instruction::Shl:
7715 case Instruction::LShr:
7716 case Instruction::AShr:
7717 case Instruction::And:
7718 case Instruction::Or:
7719 case Instruction::Xor:
7720 case Instruction::Freeze:
7721 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7723 auto *
I = dyn_cast<Instruction>(V);
7724 return I &&
I->isBinaryOp() && !
I->isFast();
7726 return TreeEntry::NeedToGather;
7727 return TreeEntry::Vectorize;
7728 case Instruction::GetElementPtr: {
7730 for (
Value *V : VL) {
7731 auto *
I = dyn_cast<GetElementPtrInst>(V);
7734 if (
I->getNumOperands() != 2) {
7735 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
7736 return TreeEntry::NeedToGather;
7742 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7743 for (
Value *V : VL) {
7744 auto *
GEP = dyn_cast<GEPOperator>(V);
7747 Type *CurTy =
GEP->getSourceElementType();
7749 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
7750 return TreeEntry::NeedToGather;
7756 for (
Value *V : VL) {
7757 auto *
I = dyn_cast<GetElementPtrInst>(V);
7760 auto *
Op =
I->getOperand(1);
7761 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7762 (
Op->getType() != Ty1 &&
7763 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7764 Op->getType()->getScalarSizeInBits() >
7765 DL->getIndexSizeInBits(
7766 V->getType()->getPointerAddressSpace())))) {
7768 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
7769 return TreeEntry::NeedToGather;
7773 return TreeEntry::Vectorize;
7775 case Instruction::Store: {
7777 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
7780 if (
DL->getTypeSizeInBits(ScalarTy) !=
7781 DL->getTypeAllocSizeInBits(ScalarTy)) {
7782 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering stores of non-packed type.\n");
7783 return TreeEntry::NeedToGather;
7787 for (
Value *V : VL) {
7788 auto *
SI = cast<StoreInst>(V);
7789 if (!
SI->isSimple()) {
7791 return TreeEntry::NeedToGather;
7800 if (CurrentOrder.empty()) {
7801 Ptr0 = PointerOps.
front();
7802 PtrN = PointerOps.
back();
7804 Ptr0 = PointerOps[CurrentOrder.front()];
7805 PtrN = PointerOps[CurrentOrder.back()];
7807 std::optional<int> Dist =
7810 if (
static_cast<unsigned>(*Dist) == VL.size() - 1)
7811 return TreeEntry::Vectorize;
7815 return TreeEntry::NeedToGather;
7817 case Instruction::Call: {
7818 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7820 auto *
I = dyn_cast<Instruction>(V);
7821 return I && !
I->isFast();
7823 return TreeEntry::NeedToGather;
7826 CallInst *CI = cast<CallInst>(VL0);
7837 return TreeEntry::NeedToGather;
7842 for (
unsigned J = 0; J != NumArgs; ++J)
7845 for (
Value *V : VL) {
7846 CallInst *CI2 = dyn_cast<CallInst>(V);
7852 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched calls:" << *CI <<
"!=" << *V
7854 return TreeEntry::NeedToGather;
7858 for (
unsigned J = 0; J != NumArgs; ++J) {
7861 if (ScalarArgs[J] != A1J) {
7863 <<
"SLP: mismatched arguments in call:" << *CI
7864 <<
" argument " << ScalarArgs[J] <<
"!=" << A1J <<
"\n");
7865 return TreeEntry::NeedToGather;
7874 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched bundle operands in calls:" << *CI
7875 <<
"!=" << *V <<
'\n');
7876 return TreeEntry::NeedToGather;
7880 return TreeEntry::Vectorize;
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // REVEC can support non alternate shuffle.
      if (SLPReVec && getShufflevectorNumGroups(VL))
        return TreeEntry::Vectorize;
      // If this is not an alternate sequence of opcode like add-sub
      // then do not vectorize this instruction.
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}
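/// Builds the operand vectors for a bundle of phi nodes: for each incoming
/// block of the main phi it collects the matching incoming value of every phi
/// in the bundle, falling back to a per-block lookup when the incoming order
/// differs, so the operands can be vectorized block by block.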
class PHIHandler {
  DominatorTree &DT;
  PHINode *Main = nullptr;
  SmallVector<Value *> Phis;
  SmallVector<SmallVector<Value *>> Operands;

public:
  PHIHandler() = delete;
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 SmallVector<Value *>(Phis.size(), nullptr)) {}
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    if (Main->getNumIncomingValues() <= FastLimit) {
      for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
        BasicBlock *InBB = Main->getIncomingBlock(I);
        // ...
        // Prepare the operand vector.
        for (auto [Idx, V] : enumerate(Phis)) {
          auto *P = dyn_cast<PHINode>(V);
          if (!P) {
            assert(isa<PoisonValue>(V) &&
                   "Expected isa instruction or poison value.");
            Operands[I][Idx] = V;
            continue;
          }
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(I);
          else
            Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
        }
      }
      return;
    }
    // Slow path: group the incoming values of the main phi by block.
    SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
    for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
      BasicBlock *InBB = Main->getIncomingBlock(I);
      // ...
      Blocks.try_emplace(InBB).first->second.push_back(I);
    }
    for (auto [Idx, V] : enumerate(Phis)) {
      if (isa<PoisonValue>(V)) {
        for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
          Operands[I][Idx] = V;
        continue;
      }
      auto *P = cast<PHINode>(V);
      for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
        BasicBlock *InBB = P->getIncomingBlock(I);
        // ...
        auto It = Blocks.find(InBB);
        if (It == Blocks.end())
          continue;
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
      }
    }
    for (const auto &P : Blocks) {
      if (P.getSecond().size() <= 1)
        continue;
      unsigned BasicI = P.getSecond().front();
      for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
        assert(all_of(enumerate(Operands[I]),
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() == Operands[BasicI][Data.index()];
                      }) &&
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
      }
    }
  }
  ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
};
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                            const EdgeInfo &UserTreeIdx,
                            unsigned InterleaveFactor) {
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

  SmallVector<int> ReuseShuffleIndices;
  SmallVector<Value *> UniqueValues;
  SmallVector<Value *> NonUniqueValueVL;
  auto TryToFindDuplicates = [&](const InstructionsState &S,
                                 bool DoNotFail = false) {
    // Check that every instruction appears once in this bundle.
    SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
    for (Value *V : VL) {
      if (isConstant(V)) {
        ReuseShuffleIndices.emplace_back(
            isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
        UniqueValues.emplace_back(V);
        continue;
      }
      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
      ReuseShuffleIndices.emplace_back(Res.first->second);
      if (Res.second)
        UniqueValues.emplace_back(V);
    }
    size_t NumUniqueScalarValues = UniqueValues.size();
    bool IsFullVectors = hasFullVectorsOrPowerOf2(
        *TTI, UniqueValues.front()->getType(), NumUniqueScalarValues);
    if (NumUniqueScalarValues == VL.size() &&
        (VectorizeNonPowerOf2 || IsFullVectors)) {
      ReuseShuffleIndices.clear();
    } else {
      // FIXME: Reshuffling scalars is not supported yet for non-power-of-2
      // vectors.
      if ((UserTreeIdx.UserTE &&
           UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
          !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) {
        LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                             "for nodes with padding.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
      if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
          (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
             return isa<UndefValue>(V) || !isConstant(V);
           }))) {
        if (DoNotFail && UniquePositions.size() > 1 &&
            NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
            all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
          // Find the number of elements, which forms full vectors.
          unsigned PWSz = getFullVectorNumberOfElements(
              *TTI, UniqueValues.front()->getType(), UniqueValues.size());
          if (PWSz == VL.size()) {
            ReuseShuffleIndices.clear();
          } else {
            NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
            NonUniqueValueVL.append(
                PWSz - UniqueValues.size(),
                PoisonValue::get(UniqueValues.front()->getType()));
            VL = NonUniqueValueVL;
          }
          return true;
        }
        LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      VL = UniqueValues;
    }
    return true;
  };

  InstructionsState S = getSameOpcode(VL, *TLI);
  // Don't go into catchswitch blocks, which can happen with PHIs.
  // Such blocks can only have PHIs and the catchswitch. There is no place to
  // insert a shuffle if we need to, so just avoid that issue.
  if (S.getMainOp() &&
      isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // Check if this is a duplicate of another entry.
  if (S.getOpcode()) {
    if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
      LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
                        << ".\n");
      if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
        auto It = MultiNodeScalars.find(S.getMainOp());
        if (It != MultiNodeScalars.end()) {
          auto *TEIt = find_if(It->getSecond(),
                               [&](TreeEntry *ME) { return ME->isSame(VL); });
          if (TEIt != It->getSecond().end())
            E = *TEIt;
          else
            E = nullptr;
        } else {
          E = nullptr;
        }
      }
      if (!E) {
        if (!doesNotNeedToBeScheduled(S.getMainOp())) {
          LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
          if (TryToFindDuplicates(S))
            newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                         ReuseShuffleIndices);
          return;
        }
        SmallPtrSet<const TreeEntry *, 4> Nodes;
        Nodes.insert(getTreeEntry(S.getMainOp()));
        for (const TreeEntry *E : MultiNodeScalars.lookup(S.getMainOp()))
          Nodes.insert(E);
        SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
        if (any_of(Nodes, [&](const TreeEntry *E) {
              if (all_of(E->Scalars,
                         [&](Value *V) { return Values.contains(V); }))
                return true;
              SmallPtrSet<Value *, 8> EValues(E->Scalars.begin(),
                                              E->Scalars.end());
              return (
                  all_of(VL, [&](Value *V) { return EValues.contains(V); }));
            })) {
          LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
          if (TryToFindDuplicates(S))
            newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                         ReuseShuffleIndices);
          return;
        }
      } else {
        // Record the reuse of the tree node.
        E->UserTreeIndices.push_back(UserTreeIdx);
        LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
                          << ".\n");
        return;
      }
    }
  }

  // Check that the depth of the recursion is not too deep.
  if (Depth >= RecursionMaxDepth &&
      !(S.getMainOp() && !S.isAltShuffle() && VL.size() >= 4 &&
        (match(S.getMainOp(), m_Load(m_Value())) ||
         all_of(VL, [&S](const Value *I) {
           return match(I,
                        m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
                  cast<Instruction>(I)->getOpcode() ==
                      S.getMainOp()->getOpcode();
         })))) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }

  // Don't handle scalable vectors.
  if (S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }

  // Don't handle vectors.
  if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }
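  // For an alternate-opcode pair of size 2, the quick look-ahead check below
  // decides whether vectorizing it (plus the extra shuffle it implies) can
  // possibly pay off before committing to recursion.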
  // If all of the operands are identical or constant we have a simple
  // solution. If we deal with insert/extract instructions, they all must
  // have constant indices, otherwise we should gather them, not try to
  // vectorize. If an alternate op node with 2 elements has gathered operands,
  // do not vectorize.
  auto &&NotProfitableForVectorization = [&S, this,
                                          Depth](ArrayRef<Value *> VL) {
    if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
      return false;
    if (VectorizableTree.size() < MinTreeSize)
      return false;
    if (Depth >= RecursionMaxDepth - 1)
      return true;
    // Check if all operands are extracts, part of vector node or can build
    // valid order for extractelements.
    SmallVector<unsigned, 2> InstsCount;
    for (Value *V : VL) {
      auto *I = cast<Instruction>(V);
      InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
      }));
    }
    bool IsCommutative =
        isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        (!IsCommutative &&
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
      return true;
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    auto *I1 = cast<Instruction>(VL.front());
    auto *I2 = cast<Instruction>(VL.back());
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
            })) >= S.getMainOp()->getNumOperands() / 2)
      return false;
    if (S.getMainOp()->getNumOperands() > 2)
      return true;
    if (IsCommutative) {
      // Check permuted operands.
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
        Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                               I2->getOperand((Op + 1) % E));
      if (any_of(
              Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
                return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
              }))
        return false;
    }
    return true;
  };
  SmallVector<unsigned> SortedIndices;
  BasicBlock *BB = nullptr;
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL);
  bool AreScatterAllGEPSameBlock =
      (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
       VL.size() > 2 &&
       all_of(VL,
              [&BB](Value *V) {
                auto *I = dyn_cast<GetElementPtrInst>(V);
                if (!I)
                  return doesNotNeedToBeScheduled(V);
                if (!BB)
                  BB = I->getParent();
                return BB == I->getParent() && I->getNumOperands() == 2;
              }) &&
       BB &&
       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
                       *SE, SortedIndices));
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || (!S.getOpcode() && allConstant(VL)) || isSplat(VL) ||
      (isa_and_present<InsertElementInst, ExtractValueInst,
                       ExtractElementInst>(S.getMainOp()) &&
       !all_of(VL, isVectorLikeInstWithConstOps)) ||
      NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }

  // Don't vectorize ephemeral values.
  if (S.getOpcode() && !EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                          << ") is ephemeral.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return;
      }
    }
  }

  // We now know that this is a vector of instructions of the same type from
  // the same block.

  // Check that none of the instructions in the bundle are already in the tree.
  for (Value *V : VL) {
    if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
        doesNotNeedToBeScheduled(V))
      continue;
    if (getTreeEntry(V)) {
      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                        << ") is already in tree.\n");
      if (TryToFindDuplicates(S))
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                     ReuseShuffleIndices);
      return;
    }
  }

  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        if (TryToFindDuplicates(S))
          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                       ReuseShuffleIndices);
        return;
      }
    }
  }

  // Special processing for sorted pointers for ScatterVectorize node with
  // constant indices only.
  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
    assert(VL.front()->getType()->isPointerTy() &&
           count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
           "Expected pointers only.");
    // Reset S to make it GetElementPtr kind of node.
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");
    S = getSameOpcode(*It, *TLI);
  }

  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  Instruction *VL0 = S.getMainOp();
  BB = VL0->getParent();

  if (S.getMainOp() &&
      (BB->isEHPad() ||
       isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
       !DT->isReachableFromEntry(BB))) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // Check that every instruction appears once in this bundle.
  if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
    return;
  // Perform specific checks for each particular instruction kind.
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
  if (State == TreeEntry::NeedToGather) {
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndices);
    return;
  }

  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);

  BlockScheduling &BS = *BSRef;

  std::optional<ScheduleData *> Bundle =
      BS.tryScheduleBundle(UniqueValues, this, S);
#ifdef EXPENSIVE_CHECKS
  // Make sure we didn't break any internal invariants.
  BS.verify();
#endif
  if (!Bundle) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    assert((!BS.getScheduleData(VL0) ||
            !BS.getScheduleData(VL0)->isPartOfBundle()) &&
           "tryScheduleBundle should cancelScheduling on failure");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
      registerNonVectorizableLoads(VL);
    return;
  }
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // Postpone PHI nodes creation.
    SmallVector<unsigned> PHIOps;
    for (unsigned I : seq<unsigned>(Operands.size())) {
      ArrayRef<Value *> Op = Operands[I];
      if (Op.empty())
        continue;
      InstructionsState S = getSameOpcode(Op, *TLI);
      if (S.getOpcode() != Instruction::PHI || S.isAltShuffle())
        buildTree_rec(Op, Depth + 1, {TE, I});
      else
        PHIOps.push_back(I);
    }
    for (unsigned I : PHIOps)
      buildTree_rec(Operands[I], Depth + 1, {TE, I});
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    auto *PH = cast<PHINode>(VL0);

    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
               TE->dump());

    // Keeps the reordered operands to avoid code duplication.
    PHIHandler Handler(*DT, PH, VL);
    Handler.buildOperands();
    for (unsigned I : seq<unsigned>(PH->getNumOperands()))
      TE->setOperand(I, Handler.getOperands(I));
    SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands());
    for (unsigned I : seq<unsigned>(PH->getNumOperands()))
      Operands[I] = Handler.getOperands(I);
    CreateOperandNodes(TE, Operands);
    return;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    } else {
      LLVM_DEBUG({
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                  "with order";
        for (unsigned Idx : CurrentOrder)
          dbgs() << " " << Idx;
        dbgs() << "\n";
      });
    }
    // Insert new order with initial value 0, if the source order is not
    // changed.
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(ExtractValueInst/ExtractElementInst).\n";
               TE->dump());
    // This is a special case, as it does not gather, but at the same time
    // we are not extending buildTree_rec() towards the operands.
    TE->setOperand(*this);
    break;
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                  decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      unsigned Idx = *getElementIndex(VL[I]);
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      Indices.pop();
    }
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 std::nullopt, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
               TE->dump());

    TE->setOperand(*this);
    buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
    return;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
    TreeEntry *TE = nullptr;
    fixupOrderingIndices(CurrentOrder);
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                        ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                   TE->dump());
      else
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                   TE->dump());
      break;
    case TreeEntry::StridedVectorize:
      // Vectorizing non-consecutive loads with a strided load.
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                 TE->dump());
      break;
    case TreeEntry::ScatterVectorize:
      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
          TE->dump());
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::NeedToGather:
      llvm_unreachable("Unexpected loads state.");
    }
    TE->setOperand(*this);
    if (State == TreeEntry::ScatterVectorize)
      buildTree_rec(PointerOps, Depth + 1, {TE, 0});
    return;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()), PrevMaxBW),
          std::min<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMinBW));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()), PrevMaxBW),
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMinBW));
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
               TE->dump());

    TE->setOperand(*this);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
          ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
        APInt Mask = DB->getDemandedBits(OpI);
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      }
      if (NumSignBits * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    }
    return;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
               TE->dump());

    ValueList Left, Right;
    VLOperands Ops(VL, VL0, *this);
    if (cast<CmpInst>(VL0)->isCommutative()) {
      // Commutative predicate - collect and sort operands of the instructions
      // so that each side is more likely to have the same opcode.
      assert(P0 == CmpInst::getSwappedPredicate(P0) &&
             "Commutative Predicate mismatch");
      Ops.reorder();
      Left = Ops.getVL(0);
      Right = Ops.getVL(1);
    } else {
      // Collect operands - commute if it uses the swapped predicate.
      for (Value *V : VL) {
        if (isa<PoisonValue>(V)) {
          Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
          Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
          continue;
        }
        auto *Cmp = cast<CmpInst>(V);
        Value *LHS = Cmp->getOperand(0);
        Value *RHS = Cmp->getOperand(1);
        if (Cmp->getPredicate() != P0)
          std::swap(LHS, RHS);
        Left.push_back(LHS);
        Right.push_back(RHS);
      }
    }
    TE->setOperand(0, Left);
    TE->setOperand(1, Right);
    buildTree_rec(Left, Depth + 1, {TE, 0});
    buildTree_rec(Right, Depth + 1, {TE, 1});
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
          ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
      if (NumSignBits0 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
          ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
      if (NumSignBits1 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
    }
    return;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
        TE->dump());

    TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
               TE->dump());
    SmallVector<ValueList, 2> Operands(2);
    // Prepare the operand vector for pointer operands.
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GetElementPtrInst>(V);
      if (!GEP) {
        Operands.front().push_back(V);
        continue;
      }
      Operands.front().push_back(GEP->getPointerOperand());
    }
    // Need to cast all indices to the same type before vectorization to
    // avoid a crash.
    // Required to be able to find correct matches between different gather
    // nodes and reuse the vectorized values rather than trying to gather them
    // again.
    int IndexIdx = 1;
    Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
    Type *Ty = all_of(VL,
                      [VL0Ty, IndexIdx](Value *V) {
                        auto *GEP = dyn_cast<GetElementPtrInst>(V);
                        if (!GEP)
                          return true;
                        return VL0Ty == GEP->getOperand(IndexIdx)->getType();
                      })
                   ? VL0Ty
                   : DL->getIndexType(cast<GetElementPtrInst>(VL0)
                                          ->getPointerOperandType()
                                          ->getScalarType());
    // Prepare the operand vector.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I) {
        Operands.back().push_back(
            ConstantInt::get(Ty, 0, /*isSigned=*/false));
        continue;
      }
      auto *Op = I->getOperand(IndexIdx);
      auto *CI = dyn_cast<ConstantInt>(Op);
      if (!CI)
        Operands.back().push_back(Op);
      else
        Operands.back().push_back(ConstantFoldIntegerCast(
            CI, Ty, CI->getValue().isSignBitSet(), *DL));
    }
    for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I) {
      TE->setOperand(I, Operands[I]);
      buildTree_rec(Operands[I], Depth + 1, {TE, I});
    }
    return;
  }
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    if (!Consecutive)
      fixupOrderingIndices(CurrentOrder);
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                 TE->dump());
    else
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
          TE->dump());
    TE->setOperand(*this);
    buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
    return;
  }
  case Instruction::Call: {
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
               TE->dump());
    TE->setOperand(*this, isCommutative(VL0));
    for (unsigned I : seq<unsigned>(CI->arg_size())) {
      // For scalar operands no need to create an entry since no need to
      // vectorize it.
      if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
        continue;
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    }
    return;
  }
  case Instruction::ShuffleVector: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                 TE->dump());
    } else {
      assert(SLPReVec && "Only supported by REVEC.");
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
                 TE->dump());
    }

    // Reorder operands if reordering would enable vectorization.
    auto *CI = dyn_cast<CmpInst>(VL0);
    if (CI && any_of(VL, [](Value *V) {
          return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
        })) {
      auto *MainCI = cast<CmpInst>(S.getMainOp());
      auto *AltCI = cast<CmpInst>(S.getAltOp());
      CmpInst::Predicate MainP = MainCI->getPredicate();
      CmpInst::Predicate AltP = AltCI->getPredicate();
      assert(MainP != AltP && "Expected different main/alternate predicates.");
      ValueList Left, Right;
      // Collect operands - commute if it uses the swapped predicate or
      // alternate operation.
      for (Value *V : VL) {
        if (isa<PoisonValue>(V)) {
          Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
          Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
          continue;
        }
        auto *Cmp = cast<CmpInst>(V);
        Value *LHS = Cmp->getOperand(0);
        Value *RHS = Cmp->getOperand(1);
        if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
          if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
            std::swap(LHS, RHS);
        } else {
          if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
            std::swap(LHS, RHS);
        }
        Left.push_back(LHS);
        Right.push_back(RHS);
      }
      TE->setOperand(0, Left);
      TE->setOperand(1, Right);
      buildTree_rec(Left, Depth + 1, {TE, 0});
      buildTree_rec(Right, Depth + 1, {TE, 1});
      return;
    }

    TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  default:
    break;
  }
  llvm_unreachable("Unexpected vectorization of the instructions.");
}
unsigned BoUpSLP::canMapToVector(Type *T) const {
  unsigned N = 1;
  Type *EltTy = T;

  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }

  if (!isValidElementType(EltTy))
    return 0;
  uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  return N;
}
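// canReuseExtract checks that a bundle of extractelement/extractvalue
// instructions all read from the same source aggregate and computes the order
// of the extracted indices, returning true if the natural order can be kept.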
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  assert(
      all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
      "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *Vec = E0->getOperand(0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of
  // elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    NElts = canMapToVector(Vec->getType());
    if (!NElts)
      return false;
    // Check if load can be rewritten as load of vector.
    LoadInst *LI = dyn_cast<LoadInst>(Vec);
    if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
      return false;
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }

  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      continue;
    if (Inst->getOperand(0) != Vec)
      return false;
    if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
      if (isa<UndefValue>(EE->getIndexOperand()))
        continue;
    std::optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      return false;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign to all items the initial value E, so we can check if the extract
  // instruction index was used already.
  // Also, later we can check that all the indices are used and we have a
  // consecutive access in the extract instructions, by checking that no
  // element of CurrentOrder still has value E.
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
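// An instruction's users count as "all vectorized" if it is a single-use
// value already recorded as vectorized, or every user is part of the tree, a
// vector-like constant op, or a must-gather extract.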
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return ScalarToTreeEntry.contains(U) ||
                  isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   ArrayRef<Type *> ArgTys) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // Calculate the cost of the scalar and vector calls.
  FastMathFlags FMF;
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF,
                                    dyn_cast<IntrinsicInst>(CI));
  auto IntrinsicCost =
      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);

  // If a suitable vector library function exists, compute its cost too; the
  // caller picks the cheaper of the two.
  auto LibCost = IntrinsicCost;
  // ...
  return {IntrinsicCost, LibCost};
}
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp,
    SmallVectorImpl<int> &Mask, SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    if (isa<PoisonValue>(Scalars[Idx]))
      continue;
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}

static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    CmpInst::Predicate MainP = MainCI->getPredicate();
    CmpInst::Predicate AltP = AltCI->getPredicate();
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    if (isCmpSameOrSwapped(MainCI, CI, TLI))
      return false;
    if (isCmpSameOrSwapped(AltCI, CI, TLI))
      return true;
    CmpInst::Predicate P = CI->getPredicate();
    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);

    assert((MainP == P || AltP == P || MainP == SwappedP ||
            AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swap.");
    return MainP != P && MainP != SwappedP;
  }
  return I->getOpcode() == AltOp->getOpcode();
}
TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
  assert(!Ops.empty());
  const auto *Op0 = Ops.front();

  const bool IsConstant = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    return isConstant(V) && !isa<UndefValue>(V);
  });
  const bool IsUniform = all_of(Ops, [=](Value *V) {
    // TODO: We should allow undef elements here.
    return V == Op0;
  });
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();
    return false;
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
    return false;
  });

  TTI::OperandValueKind VK = TTI::OK_AnyValue;
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;

  TTI::OperandValueProperties VP = TTI::OP_None;
  VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
  VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;

  return {VK, VP};
}
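// The operand-kind info (constant/uniform/power-of-two) feeds the TTI cost
// queries; for example, a divide by a uniform constant power of two is far
// cheaper than a generic divide on most targets.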
/// Base class for shuffle instruction emission and shuffle cost estimation.
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value.
  /// When REVEC is disabled, there is no difference between VF and
  /// VNumElements.
  /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
  /// e.g. if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
  /// of 8.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }
  /// Checks if the mask is an identity mask.
  /// \param IsStrict if true, the function returns false if the mask size
  /// does not match the vector size.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    // ...
    if (!IsStrict) {
      // ...
      // All VF-size submasks are identity (e.g.
      // <poison, poison, 0, 1, 2, poison> for VF 4).
      if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
            ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
            return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
                   ShuffleVectorInst::isIdentityMask(Slice, VF);
          }))
        return true;
    }
    return false;
  }

  /// Tries to combine 2 different masks into a single one.
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(NewMask);
  }
  /// Looks through shuffles trying to reduce the final number of shuffles in
  /// the code: it walks previously emitted shuffle instructions and properly
  /// marks indices in the mask as undef/identity where possible.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // Exit if not a fixed vector type or changing size shuffle.
      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
      if (!SVTy)
        break;
      // Remember the identity or broadcast mask, if it is not a resizing
      // shuffle. If no better candidates are found, this Op and Mask will be
      // used in the final shuffle.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
                                                    IdentityMask.size()))) {
          IdentityOp = SV;
          // Store the current mask so the info is not lost if IdentityOp is
          // selected as the best candidate for the permutation.
          IdentityMask.assign(Mask);
        }
      }
      // Remember the broadcast mask.
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      bool IsOp1Undef = isUndefVector</*IsPoisonOnly=*/true>(
                            SV->getOperand(0),
                            buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
                            .all();
      bool IsOp2Undef = isUndefVector</*IsPoisonOnly=*/true>(
                            SV->getOperand(1),
                            buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
                            .all();
      if (!IsOp1Undef && !IsOp2Undef) {
        // Update mask and mark undef elements.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
        ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        // ...
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               /*IsStrict=*/true) ||
                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }
  /// Smart shuffle instruction emission, walks through shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder) {
    assert(V1 && "Expected at least one vector value.");
    if (V2)
      Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
                   V2, buildUseMask(VF, Mask, UseMask::SecondArg))
                   .all()) {
      // Peek through shuffles.
      Value *Op1 = V1;
      Value *Op2 = V2;
      int VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1;
      Value *PrevOp2;
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
        // Check if we have 2 resizing shuffles - need to peek through operands
        // again.
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
            SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask1)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask1[Idx] = SV1->getMaskValue(I);
            }
            SmallBitVector UseMask1 = buildUseMask(
                cast<FixedVectorType>(SV1->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask1, UseMask::SecondArg);
            SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask2)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask2[Idx] = SV2->getMaskValue(I);
            }
            SmallBitVector UseMask2 = buildUseMask(
                cast<FixedVectorType>(SV2->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask2, UseMask::SecondArg);
            if (SV1->getOperand(0)->getType() ==
                    SV2->getOperand(0)->getType() &&
                SV1->getOperand(0)->getType() != SV1->getType() &&
                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
              Op1 = SV1->getOperand(0);
              Op2 = SV2->getOperand(0);
              SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
              CombinedMask1.swap(ShuffleMask1);
              SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
              CombinedMask2.swap(ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(cast<VectorType>(Op1->getType())
                        ->getElementCount()
                        .getKnownMinValue(),
                    cast<VectorType>(Op2->getType())
                        ->getElementCount()
                        .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      if (Op1 == Op2 &&
          (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
           (isa<ShuffleVectorInst>(Op1) &&
            cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
                ArrayRef(CombinedMask1))))
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    SmallVector<int> NewMask(Mask);
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");

    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }
};
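/// Computes the scalar and the vector cost of materializing the pointer
/// chain (base pointer plus per-lane GEPs) for a memory operation, so the
/// caller can compare the two vectorization strategies.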
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Estimate costs for pointers when vectorizing to a plain wide
    // load/store: the scalar cost is a chain of unit-stride pointers, while
    // in vector code only BasePtr plus the pointers with external uses
    // remain.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);

    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity, assume Ptr stays in vectorized code if it's not a
      // GEP instruction; its cost is considered free.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }

    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then we don't have any
      // savings on that.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // Ptrs are the arguments of loads to be transformed into a masked gather;
    // all the scalar GEPs will be removed as a result of vectorization.
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();

    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }

  return std::make_pair(ScalarCost, VecCost);
}
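// reorderGatherNode clusters "compatible" scalars of a gather node next to
// each other (e.g. loads from the same base pointer) and keeps the reordering
// only if its estimated shuffle cost beats a plain buildvector.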
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  MapVector<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
  SmallSet<size_t, 2> LoadKeyUsed;

  // Do not reorder nodes if it is small (just 2 elements), all-constant or
  // all instructions have the same opcode already.
  if (TE.Scalars.size() == 2 || (TE.getOpcode() && !TE.isAltShuffle()) ||
      all_of(TE.Scalars, isConstant))
    return;

  if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(TE.Scalars);
      }))
    return;

  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    Key = hash_combine(hash_value(LI->getParent()), Key);
    Value *Ptr =
        getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
    if (LoadKeyUsed.contains(Key)) {
      auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
      if (LIt != LoadsMap.end()) {
        for (LoadInst *RLI : LIt->second) {
          if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                              LI->getType(), LI->getPointerOperand(), *DL, *SE,
                              /*StrictCheck=*/true))
            return hash_value(RLI->getPointerOperand());
        }
        for (LoadInst *RLI : LIt->second) {
          if (arePointersCompatible(RLI->getPointerOperand(),
                                    LI->getPointerOperand(), *TLI))
            return hash_value(RLI->getPointerOperand());
        }
        if (LIt->second.size() > 2)
          return hash_value(LIt->second.back()->getPointerOperand());
      }
    }
    LoadKeyUsed.insert(Key);
    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
    return hash_value(LI->getPointerOperand());
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // Try to "cluster" scalar instructions, to be able to build extra
  // vectorized nodes.
  for (auto [I, V] : enumerate(TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    if (auto *Inst = dyn_cast<Instruction>(V);
        Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
        !isDeleted(Inst) && !isVectorized(V)) {
      std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
                                             /*AllowAlternate=*/false);
      ++NumInstructions;
    }
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        !(isa<Constant, ExtractElementInst>(V) ||
          isVectorLikeInstWithConstOps(V)) &&
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    if (KTI.empty())
      Container[Idx].push_back(V);
    KTI.push_back(I);
  }
  SmallVector<std::pair<unsigned, unsigned>> SubVectors;
  APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
  if (!IsOrdered && NumInstructions > 1) {
    unsigned Cnt = 0;
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        unsigned Sz = 0;
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        if (Sz > 1 && isa<Instruction>(P.second.front())) {
          const unsigned SubVF = getFloorFullVectorNumberOfElements(
              *TTI, TE.Scalars.front()->getType(), Sz);
          SubVectors.emplace_back(Cnt - Sz, SubVF);
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
            DemandedElts.clearBit(I);
        } else if (!P.second.empty() && isConstant(P.second.front())) {
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
            DemandedElts.clearBit(I);
        }
      }
    }
  }
  // Reuses always require shuffles, so consider them as profitable.
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // Do simple cost estimation.
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  auto *ScalarTy = TE.Scalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
  for (auto [Idx, Sz] : SubVectors) {
    Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {},
                             CostKind, Idx, getWidenedType(ScalarTy, Sz));
  }
  if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
    assert(SLPReVec && "Only supported by REVEC.");
    // If ScalarTy is FixedVectorType, insertion happens per subvector.
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    for (unsigned I : seq<unsigned>(TE.Scalars.size()))
      if (DemandedElts[I])
        Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, {},
                                    CostKind, I * ScalarTyNumElements, FTy);
  } else {
    Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
                                          /*Extract=*/false, CostKind);
  }
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isa<PoisonValue>(V))
      ReorderMask[I] = PoisonMaskElem;
    else if (isConstant(V) || DemandedElts[I])
      ReorderMask[I] = I + TE.ReorderIndices.size();
  }
  Cost += ::getShuffleCost(*TTI,
                           any_of(ReorderMask, [&](int I) { return I >= Sz; })
                               ? TTI::SK_PermuteTwoSrc
                               : TTI::SK_PermuteSingleSrc,
                           VecTy, ReorderMask);
  DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
  ReorderMask.assign(Sz, PoisonMaskElem);
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isConstant(V)) {
      DemandedElts.clearBit(I);
      if (!isa<PoisonValue>(V))
        ReorderMask[I] = I;
    } else {
      ReorderMask[I] = I + Sz;
    }
  }
  InstructionCost BVCost = TTI->getScalarizationOverhead(
      VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
  if (!DemandedElts.isAllOnes())
    BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    reorderScalars(TE.Scalars, Mask);
    TE.ReorderIndices.clear();
  }
}
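// transformNodes performs post-build graph transforms: re-vectorizing slices
// of gather nodes, converting reversed consecutive loads/stores into strided
// accesses, detecting interleaved stores, and forming combined min/max nodes.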
void BoUpSLP::transformNodes() {
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  BaseGraphSize = VectorizableTree.size();
  // Turn graph transforming mode on and off, when done.
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // Operands are profitable if they are:
  // 1. At least one constant
  // or
  // 2. Splats
  // or
  // 3. Results in good vectorization opportunity, i.e. may generate vector
  // nodes and reduce cost of the graph.
  auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
                                           const InstructionsState &S) {
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    return all_of(
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          return all_of(Cand,
                        [](const std::pair<Value *, Value *> &P) {
                          return isa<Constant>(P.first) ||
                                 isa<Constant>(P.second) ||
                                 P.first == P.second;
                        }) ||
                 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
        });
  };

  // Try to reorder gather nodes for better vectorization opportunities.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather())
      reorderGatherNode(E);
  }

  // The tree may grow here, so iterate over nodes, built before.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      ArrayRef<Value *> VL = E.Scalars;
      const unsigned Sz = getVectorElementSize(VL.front());
      unsigned MinVF = getMinVF(2 * Sz);
      // Do not try partial vectorization for small nodes (<= 2), nodes with
      // the same opcode and same parent block, or all constants.
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
            E.isAltShuffle() || !allSameBlock(VL)) ||
          allConstant(VL) || isSplat(VL))
        continue;
      // Try to find vectorizable sequences and transform them into a series
      // of insertvector instructions.
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      for (unsigned VF = getFloorFullVectorNumberOfElements(
               *TTI, VL.front()->getType(), VL.size() - 1);
           VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
                            *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
          continue;
        SmallVector<std::pair<unsigned, unsigned>> Slices;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          // If any instruction is vectorized already - do not try again.
          // Reuse the existing node, if it fully matches the slice.
          if (const TreeEntry *SE = getTreeEntry(Slice.front());
              SE || getTreeEntry(Slice.back())) {
            if (!SE)
              continue;
            if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
              continue;
          }
          // Constant already handled effectively - skip.
          if (allConstant(Slice))
            continue;
          // Do not try to vectorize small splats (less than vector register
          // and only with the single non-undef element).
          bool IsSplat = isSplat(Slice);
          if (Slices.empty() || !IsSplat ||
              (VF <= 2 &&
               2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
                                  Slice.front()->getType(), VF)),
                              1U, VF - 1) !=
                   std::clamp(TTI->getNumberOfParts(getWidenedType(
                                  Slice.front()->getType(), 2 * VF)),
                              1U, 2 * VF - 1)) ||
              count(Slice, Slice.front()) ==
                  static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
                                                                   : 1)) {
            if (IsSplat)
              continue;
            InstructionsState S = getSameOpcode(Slice, *TLI);
            if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice) ||
                (S.getOpcode() == Instruction::Load &&
                 areKnownNonVectorizableLoads(Slice)) ||
                (S.getOpcode() != Instruction::Load && !has_single_bit(VF)))
              continue;
            if (VF == 2) {
              // Try to vectorize reduced values or if all users are
              // vectorized.
              if ((!UserIgnoreList || E.Idx != 0) &&
                  TTI->getNumberOfParts(getWidenedType(
                      Slice.front()->getType(), VF)) <= 1 &&
                  !all_of(Slice, [&](Value *V) {
                    if (isa<PoisonValue>(V))
                      return true;
                    return areAllUsersVectorized(cast<Instruction>(V),
                                                 UserIgnoreList);
                  }))
                continue;
              if (S.getOpcode() == Instruction::Load) {
                OrdersType Order;
                SmallVector<Value *> PointerOps;
                LoadsState Res =
                    canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
                // Do not vectorize gathers.
                if (Res == LoadsState::ScatterVectorize ||
                    Res == LoadsState::Gather) {
                  if (Res == LoadsState::Gather) {
                    registerNonVectorizableLoads(Slice);
                    // If reductions and the scalars from the root node are
                    // analyzed - mark as non-vectorizable reduction.
                    if (UserIgnoreList && E.Idx == 0)
                      analyzedReductionVals(Slice);
                  }
                  continue;
                }
              } else if (S.getOpcode() == Instruction::ExtractElement ||
                         (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                              TTI::TCC_Expensive &&
                          !CheckOperandsProfitability(
                              S.getMainOp(),
                              cast<Instruction>(*find_if(
                                  reverse(Slice), IsaPred<Instruction>)),
                              S))) {
                // Do not vectorize extractelements (handled effectively
                // already). Do not vectorize non-profitable instructions
                // (with low cost and non-vectorizable operands).
                continue;
              }
            }
          }
          Slices.emplace_back(Cnt, Slice.size());
        }
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
            End = Cnt;
        };
        for (auto [Cnt, Sz] : Slices) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
          // If any instruction is vectorized already - do not try again.
          if (TreeEntry *SE = getTreeEntry(Slice.front());
              SE || getTreeEntry(Slice.back())) {
            if (!SE)
              continue;
            if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
              continue;
            SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
            AddCombinedNode(SE->Idx, Cnt, Sz);
            continue;
          }
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
              analyzedReductionVals(Slice);
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            continue;
          }
          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
    }
    // Restore ordering, if no extra vectorization happened.
    if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
      SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
      reorderScalars(E.Scalars, Mask);
      E.ReorderIndices.clear();
    }
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to reorder masked gather loads, just reorder the scalar
      // operands.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as
      // strided load with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost)
          // Strided load is more profitable than consecutive load + reverse -
          // transform the node to strided load.
          E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // Check if profitable to represent consecutive store + reverse as
      // strided store with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // Strided store is more profitable than reverse + consecutive
          // store - transform the node to strided store.
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          auto *BaseSI = cast<StoreInst>(E.Scalars.front());
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            return 0u;
          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
            if (ShuffleVectorInst::isInterleaveMask(
                    Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
                TTI.isLegalInterleavedAccessType(
                    VecTy, Factor, BaseSI->getAlign(),
                    BaseSI->getPointerAddressSpace()))
              return Factor;
          }
          return 0u;
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
      }
      break;
    }
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
      if (MinMaxID == Intrinsic::not_intrinsic)
        break;
      // This node is a minmax node.
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
      if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }

  if (LoadEntriesToVectorize.empty()) {
    // Single load node - exit.
    if (VectorizableTree.size() <= 1 &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // Small graph with small VF - exit.
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;

    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
        getCanonicalGraphSize() <= SmallTree &&
        count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
                 [](const std::unique_ptr<TreeEntry> &TE) {
                   return TE->isGather() &&
                          TE->getOpcode() == Instruction::Load &&
                          !allSameBlock(TE->Scalars);
                 }) == 1)
      return;
  }

  // A list of loads to be gathered during the vectorization process. We can
  // try to vectorize them at the end, if profitable.
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
      GatheredLoads;

  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        (E.getOpcode() == Instruction::Load ||
         (!E.getOpcode() && any_of(E.Scalars,
                                   [&](Value *V) {
                                     return isa<LoadInst>(V) &&
                                            !isVectorized(V) &&
                                            !isDeleted(cast<Instruction>(V));
                                   }))) &&
        !isSplat(E.Scalars)) {
      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
          continue;
        gatherPossiblyVectorizableLoads(
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
                LI->getParent(),
                getUnderlyingObject(LI->getPointerOperand(),
                                    RecursionMaxDepth),
                LI->getType())]);
      }
    }
  }
  // Try to vectorize gathered loads if this is not just a gather of loads.
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
}
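/// Cost-estimation counterpart of the shuffle analysis: instead of emitting
/// shuffles it accumulates their TTI cost, merging masks lazily so that
/// chains of shuffles over the same nodes are priced as a single permute
/// where possible.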
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  SmallVector<int> CommonMask;
  SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
  const TargetTransformInfo &TTI;
  InstructionCost Cost = 0;
  SmallDenseSet<Value *> VectorizedVals;
  BoUpSLP &R;
  SmallPtrSetImpl<Value *> &CheckedExtracts;
  constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  /// While set, still trying to estimate the cost for the same nodes and we
  /// can delay actual cost estimation (virtual shuffle instruction emission).
  /// May help better estimate the cost if same nodes must be permuted, and
  /// allows moving most of the long shuffles cost estimation to TTI.
  bool SameNodesEstimated = true;

  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    if (Ty->getScalarType()->isPointerTy()) {
      Constant *Res = ConstantExpr::getIntToPtr(
          ConstantInt::getAllOnesValue(IntegerType::get(
              Ty->getContext(),
              DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
      if (auto *VTy = dyn_cast<VectorType>(Ty))
        Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
      return Res;
    }
    return Constant::getAllOnesValue(Ty);
  }

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
      return TTI::TCC_Free;
    auto *VecTy = getWidenedType(ScalarTy, VL.size());
    InstructionCost GatherCost = 0;
    SmallVector<Value *> Gathers(VL);
    if (!Root && isSplat(VL)) {
      // Found the broadcasting of the single scalar, calculate the cost as
      // the broadcast.
      const auto *It = find_if_not(VL, IsaPred<UndefValue>);
      assert(It != VL.end() && "Expected at least one non-undef value.");
      // Add broadcast for non-identity shuffle only.
      bool NeedShuffle =
          count(VL, *It) > 1 /* ... non-identity check elided ... */;
      if (!NeedShuffle) {
        if (isa<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          return TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, {},
                                    CostKind, std::distance(VL.begin(), It),
                                    cast<FixedVectorType>(ScalarTy));
        }
        return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                      CostKind, std::distance(VL.begin(), It),
                                      PoisonValue::get(VecTy), *It);
      }
      SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
      transform(VL, ShuffleMask.begin(), [](Value *V) {
        return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
      });
      return GatherCost +
             ::getShuffleCost(TTI, TTI::SK_Broadcast, VecTy, ShuffleMask,
                              CostKind, /*Index=*/0, /*SubTp=*/nullptr,
                              /*Args=*/*It);
    }
    // ...
    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)
                ? TTI::TCC_Free
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                  ScalarTy));
  };

  /// Compute the cost of creating a vector containing the extracted values
  /// from \p VL.
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    // Checks if the mask is an identity mask per register.
    unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
                                        SmallVectorImpl<unsigned> &Indices)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      DenseSet<int> RegIndices;
      // Check that if trying to permute the same single or two input vectors.
      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      for (auto [Pos, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            OffsetReg1 = alignDown(
                std::accumulate(
                    std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                    [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(S, I);
                    }),
                EltsPerVector);
            Indices.push_back(OffsetReg1 % NumElts);
          }
          Idx = I - OffsetReg1;
        }
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    InstructionCost Cost = 0;

    // Process extracts in blocks of EltsPerVector to check if the source
    // vector operand can be re-used directly. If not, add the cost of
    // creating a shuffle to extract the values into a vector register.
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      SmallVector<unsigned, 2> Indices;
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices);
      if (!RegShuffleKind) {
        if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
            !ShuffleVectorInst::isIdentityMask(
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, *ShuffleKinds[Part],
                               getWidenedType(ScalarTy, NumElts), MaskSlice);
        continue;
      }
      if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
          !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
        Cost += ::getShuffleCost(TTI, *RegShuffleKind,
                                 getWidenedType(ScalarTy, EltsPerVector),
                                 SubMask);
      }
      const unsigned BaseVF = getFullVectorNumberOfElements(
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (unsigned Idx : Indices) {
        assert((Idx + EltsPerVector) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
                                 getWidenedType(ScalarTy, BaseVF), {},
                                 CostKind, Idx,
                                 getWidenedType(ScalarTy, EltsPerVector));
      }
      // Second attempt to check if just a permute is better estimated than
      // the subvector extract.
      SmallVector<int> OriginalMask(NumParts * EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, OriginalMask.begin() + Part * EltsPerVector);
      InstructionCost OriginalCost = ::getShuffleCost(
          TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts),
          OriginalMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }

  /// Transforms mask \p CommonMask per given \p Mask to make proper set after
  /// shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx;
  }

  /// Adds 2 input vectors (in form of tree entries) and the mask for their
  /// shuffling.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are reshuffling.
      // If we already requested the cost of reshuffling of E1 and E2 before,
      // no need to estimate another cost with the sub-Mask; instead include
      // this sub-Mask into the CommonMask to estimate it later and avoid
      // double cost estimation.
      if ((InVectors.size() == 2 &&
           cast<const TreeEntry *>(InVectors.front()) == &E1 &&
           cast<const TreeEntry *>(InVectors.back()) == E2) ||
          (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
               "Expected all poisoned elements.");
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the matched
      // ones and transform the mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF,
                      cast<FixedVectorType>(V1->getType())->getNumElements());
      } else {
        const auto *E = cast<const TreeEntry *>(InVectors.front());
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = P.dyn_cast<Value *>()) {
        VF = std::max(VF, getNumElements(V1->getType()));
      } else {
        const auto *E = cast<const TreeEntry *>(P);
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }
  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      int Index = -1;
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
             (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
              Index == 0);
    }

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF = cast<VectorType>(V1->getType())
                        ->getElementCount()
                        .getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF = cast<VectorType>(V1->getType())
                        ->getElementCount()
                        .getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
    InstructionCost createPoison(Type *, unsigned) const {
      return TTI::TCC_Free;
    }
    void resizeToMatch(Value *&, Value *&) const {}
  };
  /// Adds the cost of reshuffling of the entries \p P1 and \p P2, defined by
  /// the mask \p Mask.
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask);
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      if (E.isGather() && allConstant(E.Scalars))
        return TTI::TCC_Free;
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        EScalarTy =
            IntegerType::get(EScalarTy->getContext(), It->second.first);
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    getWidenedType(EScalarTy, VF),
                                    TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      if (isa<Constant>(V))
        return TTI::TCC_Free;
      auto *VecTy = cast<VectorType>(V->getType());
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
            VecTy, TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        SmallVector<int> E2Mask = E2->getCommonMask();
        if (!EMask.empty() || !E2Mask.empty()) {
          for (int &Idx : CommonMask) {
            if (Idx == PoisonMaskElem)
              continue;
            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
              Idx = EMask[Idx];
            else if (Idx >= static_cast<int>(CommonVF))
              Idx = (E2Mask.empty() ? Idx - CommonVF
                                    : E2Mask[Idx - CommonVF]) +
                    E->Scalars.size();
          }
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle single entry node.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
                                                               Factor)) {
        // Deinterleaved nodes are free.
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      // Not identity/broadcast? Try to see if the original vector is better.
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 }) &&
          any_of(CommonMask,
                 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
        SmallVector<int> ReorderMask;
        inversePermutation(E->ReorderIndices, ReorderMask);
        ::addMask(CommonMask, ReorderMask);
      }
    } else if (V1 && P2.isNull()) {
      // Shuffle single vector.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle vector and tree node.
      unsigned VF = getVF(V1);
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // Shuffle tree node and vector.
      unsigned VF = getVF(V2);
      const TreeEntry *E1 = cast<const TreeEntry *>(P1);
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      if (V1->getType() != V2->getType()) {
        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      } else {
        if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
          V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
          V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      }
    }
    if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
      assert(SLPReVec && "FixedVectorType is not expected.");
      transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
                                             CommonMask);
    }
    InVectors.front() =
        Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder);
  }

public:
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    // Check if it can be considered reused if same extractelements were
    // vectorized already.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((!TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      for (auto [I, V] :
           enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
        // Ignore non-extractelement scalars.
        if (isa<UndefValue>(V) ||
            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
          continue;
        // If all users of an instruction are going to be vectorized and this
        // instruction itself is not going to be vectorized, consider this
        // extractelement as dead and remove its cost from the final cost of
        // the vectorized tree. Also avoid adjusting the cost for
        // extractelements with multiple uses in different graph entries.
        auto *EE = cast<ExtractElementInst>(V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        const TreeEntry *VE = R.getTreeEntry(V);
        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
            any_of(EE->users(),
                   [&](User *U) {
                     return isa<GetElementPtrInst>(U) &&
                            !R.areAllUsersVectorized(cast<Instruction>(U),
                                                     &VectorizedVals);
                   }) ||
            (VE && VE != E))
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          Instruction *Ext = EE->user_back();
          if (isa<SExtInst, ZExtInst>(Ext) &&
              all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() to calculate the cost of
            // extractelement/ext pair.
            Cost -= TTI.getExtractWithExtendCost(
                Ext->getOpcode(), Ext->getType(),
                EE->getVectorOperandType(), Idx);
            // Add back the cost of s|zext, which is subtracted separately.
            Cost += TTI.getCastInstrCost(
                Ext->getOpcode(), Ext->getType(), EE->getType(),
                TTI::getCastContextHint(Ext), CostKind, Ext);
            continue;
          }
        }
        Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
                                       CostKind, Idx);
      }
    }
    // Check that gather of extractelements can be represented as just a
    // shuffle of a single/two vectors the scalars are extracted from.
    // Done for reused if same extractelements were vectorized already.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    InVectors.assign(1, E);
    CommonMask.assign(Mask.begin(), Mask.end());
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      VecBase =
          Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    }
    return VecBase;
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (&E1 == &E2) {
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
    if (NumParts == 0 || NumParts >= Mask.size() ||
        MaskVecTy->getNumElements() % NumParts != 0 ||
        !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
                                  MaskVecTy->getNumElements() / NumParts))
      NumParts = 1;
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
    if (NumParts == 0 || NumParts >= Mask.size() ||
        MaskVecTy->getNumElements() % NumParts != 0 ||
        !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
                                  MaskVecTy->getNumElements() / NumParts))
      NumParts = 1;
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    // May come only for shuffling of 2 vectors with extractelements, already
    // handled in adjustExtracts.
    assert(InVectors.size() == 1 &&
           all_of(enumerate(CommonMask),
                  [&](auto P) {
                    if (P.value() == PoisonMaskElem)
                      return Mask[P.index()] == PoisonMaskElem;
                    auto *EI = cast<ExtractElementInst>(
                        cast<const TreeEntry *>(InVectors.front())
                            ->getOrdered(P.index()));
                    return EI->getVectorOperand() == V1 ||
                           EI->getVectorOperand() == V2;
                  }) &&
           "Expected extractelement vectors.");
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled them in adjustExtracts.
      assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
             !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar = cast<const TreeEntry *>(InVectors[0])
                                          ->getOrdered(P.index());
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      if (isa<Constant>(V1))
                        return true;
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(
          VF,
          cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
              ->getNumElements());
    }
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // FIXME: Need to find a way to avoid use of getNullValue here.
      SmallVector<Constant *> Vals;
      unsigned VF = VL.size();
      if (MaskVF != 0)
        VF = std::min(VF, MaskVF);
      for (Value *V : VL.take_front(VF)) {
        if (isa<UndefValue>(V)) {
          Vals.push_back(cast<Constant>(V));
          continue;
        }
        Vals.push_back(Constant::getNullValue(V->getType()));
      }
      if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        // When REVEC is enabled, we need to expand vector types into scalar
        // types.
        unsigned VecTyNumElements = VecTy->getNumElements();
        SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
        for (auto [I, V] : enumerate(Vals)) {
          Type *ScalarTy = V->getType()->getScalarType();
          Constant *NewVal;
          if (isa<PoisonValue>(V))
            NewVal = PoisonValue::get(ScalarTy);
          else if (isa<UndefValue>(V))
            NewVal = UndefValue::get(ScalarTy);
          else
            NewVal = Constant::getNullValue(ScalarTy);
          std::fill_n(NewVals.begin() + I * VecTyNumElements, VecTyNumElements,
                      NewVal);
        }
        Vals.swap(NewVals);
      }
      return ConstantVector::get(Vals);
    }
    return ConstantVector::getSplat(
        ElementCount::getFixed(
            cast<FixedVectorType>(Root->getType())->getNumElements()),
        getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
  }
  InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles.
  InstructionCost
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = cast<Value *>(Vec);
      Action(V, CommonMask);
      InVectors.front() = V;
    }
    if (!SubVectors.empty()) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      // Add subvectors permutation cost.
      if (!SubVectorsMask.empty()) {
        assert(SubVectorsMask.size() <= CommonMask.size() &&
               "Expected same size of masks for subvectors and common mask.");
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                                 getWidenedType(ScalarTy, CommonMask.size()),
                                 SVMask, CostKind);
      }
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          EScalarTy =
              IntegerType::get(EScalarTy->getContext(), It->second.first);
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
              getWidenedType(EScalarTy, E->getVectorFactor()),
              TTI::CastContextHint::Normal, CostKind);
        }
        Cost += ::getShuffleCost(
            TTI, TTI::SK_InsertSubvector,
            getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
            getWidenedType(ScalarTy, E->getVectorFactor()));
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                    Idx);
        }
      }
    }
    if (!ExtMask.empty()) {
      // ...
      ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }

  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
    return VE;
  const auto *It =
      find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() &&
               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                 return EI.EdgeIdx == Idx && EI.UserTE == E;
               }) != TE->UserTreeIndices.end();
      });
  assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
  return It->get();
}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    SmallVector<int> Mask;
    inversePermutation(TE.ReorderIndices, Mask);
    if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
      return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}

/// Builds the arguments types vector for the given call instruction with the
/// given \p ID for the specified vector factor.
static SmallVector<Type *>
buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
                       const unsigned VF, unsigned MinBW,
                       const TargetTransformInfo *TTI) {
  SmallVector<Type *> ArgTys;
  // ...
  return ArgTys;
}
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
  ArrayRef<Value *> VL = E->Scalars;

  Type *ScalarTy = getValueType(VL[0]);
  // ...
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    if (VecTy)
      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
  }
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // If we have computed a smaller type for the expression, update VecTy so
  // that the costs will be accurate.
  unsigned EntryVF = E->getVectorFactor();
  auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);

  if (E->isGather()) {
    if (allConstant(VL))
      return 0;
    if (isa<InsertElementInst>(VL[0]))
      return InstructionCost::getInvalid();
    if (isa<CmpInst>(VL.front()))
      ScalarTy = VL.front()->getType();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  InstructionCost CommonCost = 0;
  SmallVector<int> Mask;
  if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
                                     !isReverseOrder(E->ReorderIndices))) {
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
    } else {
      inversePermutation(E->ReorderIndices, NewMask);
    }
    ::addMask(Mask, NewMask);
  }
  if (!E->ReuseShuffleIndices.empty())
    ::addMask(Mask, E->ReuseShuffleIndices);
  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
    CommonCost =
        ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize) &&
         "Unhandled state");
  assert(E->getOpcode() &&
         ((allSameType(VL) && allSameBlock(VL)) ||
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy())) &&
         "Invalid VL");
  Instruction *VL0 = E->getMainOp();
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (isa<Instruction>(UniqueValues[I]) &&
        getTreeEntry(UniqueValues[I]) == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (const TreeEntry *OpTE = getTreeEntry(V))
      return getCastContextHint(*OpTE);
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
  auto GetCostDiff =
      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
          function_ref<InstructionCost(InstructionCost)> GetVectorCost) {
        // Calculate the cost of this instruction.
        InstructionCost ScalarCost = 0;
        if (isa<CastInst, CallInst>(VL0)) {
          // For some of the instructions no need to calculate cost for each
          // particular instruction; we can use the cost of the single
          // instruction x total number of scalar instructions.
          ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
        } else {
          for (unsigned I = 0; I < Sz; ++I) {
            if (UsedScalars.test(I))
              continue;
            ScalarCost += ScalarEltCost(I);
          }
        }

        InstructionCost VecCost = GetVectorCost(CommonCost);
        // Check if the current node must be resized, if the parent node is
        // not resized.
        if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
            E->Idx != 0 &&
            (E->getOpcode() != Instruction::Load ||
             !E->UserTreeIndices.empty())) {
          const EdgeInfo &EI =
              *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
                return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
              });
          if (EI.UserTE->getOpcode() != Instruction::Select ||
              EI.EdgeIdx != 0) {
            auto UserBWIt = MinBWs.find(EI.UserTE);
            Type *UserScalarTy =
                EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
            if (UserBWIt != MinBWs.end())
              UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                              UserBWIt->second.first);
            if (ScalarTy != UserScalarTy) {
              unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
              unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
              unsigned VecOpcode;
              auto *UserVecTy =
                  getWidenedType(UserScalarTy, E->Scalars.size());
              if (BWSz > SrcBWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              TTI::CastContextHint CCH = GetCastContextHint(VL0);
              VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy,
                                               CCH, CostKind);
            }
          }
        }
        LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
                                 ScalarCost, "Calculated costs for Tree"));
        return VecCost - ScalarCost;
      };
  // Calculate cost difference from vectorizing set of GEPs.
  // Negative value means vectorizing is profitable.
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize) &&
           "Entry state expected to be Vectorize or StridedVectorize here.");
    InstructionCost ScalarCost = 0;
    InstructionCost VecCost = 0;
    std::tie(ScalarCost, VecCost) = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));

    return VecCost - ScalarCost;
  };

  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
    if (MinMaxID == Intrinsic::not_intrinsic)
      return InstructionCost::getInvalid();
    Type *CanonicalType = Ty;
    if (CanonicalType->isPtrOrPtrVectorTy())
      CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
          CanonicalType->getContext(),
          DL->getTypeSizeInBits(CanonicalType->getScalarType())));

    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    InstructionCost IntrinsicCost =
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    // If the selects are the only uses of the compares, they will be dead
    // and we can adjust the cost by removing their cost.
    if (VI && SelectOnly) {
      assert((!Ty->isVectorTy() || SLPReVec) &&
             "Expected only for scalar type.");
      auto *CI = cast<CmpInst>(VI->getOperand(0));
      IntrinsicCost -= TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
  };
  switch (ShuffleOrOp) {
11194 case Instruction::PHI: {
11198 for (
Value *V : UniqueValues) {
11199 auto *
PHI = dyn_cast<PHINode>(V);
11204 for (
unsigned I = 0,
N =
PHI->getNumIncomingValues();
I <
N; ++
I) {
11208 if (
const TreeEntry *OpTE = getTreeEntry(
Operands.front()))
11210 if (!OpTE->ReuseShuffleIndices.empty())
11211 ScalarCost +=
TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
11212 OpTE->Scalars.size());
11215 return CommonCost - ScalarCost;
11217 case Instruction::ExtractValue:
11218 case Instruction::ExtractElement: {
11219 auto GetScalarCost = [&](
unsigned Idx) {
11220 if (isa<PoisonValue>(UniqueValues[
Idx]))
11223 auto *
I = cast<Instruction>(UniqueValues[
Idx]);
11225 if (ShuffleOrOp == Instruction::ExtractElement) {
11226 auto *EE = cast<ExtractElementInst>(
I);
11227 SrcVecTy = EE->getVectorOperandType();
11229 auto *EV = cast<ExtractValueInst>(
I);
11230 Type *AggregateTy = EV->getAggregateOperand()->getType();
11232 if (
auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11233 NumElts = ATy->getNumElements();
11238 if (
I->hasOneUse()) {
11240 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
11241 all_of(
Ext->users(), IsaPred<GetElementPtrInst>)) {
11248 Ext->getOpcode(),
Ext->getType(),
I->getType(),
11256 auto GetVectorCost = [](
InstructionCost CommonCost) {
return CommonCost; };
11257 return GetCostDiff(GetScalarCost, GetVectorCost);
11259 case Instruction::InsertElement: {
11260 assert(E->ReuseShuffleIndices.empty() &&
11261 "Unique insertelements only are expected.");
11262 auto *SrcVecTy = cast<FixedVectorType>(VL0->
getType());
11263 unsigned const NumElts = SrcVecTy->getNumElements();
11264 unsigned const NumScalars = VL.
size();
11270 unsigned OffsetEnd = OffsetBeg;
11271 InsertMask[OffsetBeg] = 0;
11274 if (OffsetBeg >
Idx)
11276 else if (OffsetEnd <
Idx)
11278 InsertMask[
Idx] =
I + 1;
11281 if (NumOfParts > 0 && NumOfParts < NumElts)
11282 VecScalarsSz =
PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
11283 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
11285 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
11286 unsigned InsertVecSz = std::min<unsigned>(
11288 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
11289 bool IsWholeSubvector =
11290 OffsetBeg ==
Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
11294 if (OffsetBeg + InsertVecSz > VecSz) {
11297 InsertVecSz = VecSz;
11303 if (!E->ReorderIndices.empty()) {
11308 std::iota(
Mask.begin(), std::next(
Mask.begin(), InsertVecSz), 0);
11310 bool IsIdentity =
true;
11312 Mask.swap(PrevMask);
11313 for (
unsigned I = 0;
I < NumScalars; ++
I) {
11315 DemandedElts.
setBit(InsertIdx);
11316 IsIdentity &= InsertIdx - OffsetBeg ==
I;
11317 Mask[InsertIdx - OffsetBeg] =
I;
11319 assert(
Offset < NumElts &&
"Failed to find vector index offset");
11333 InsertVecTy, Mask);
11334 auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
11335 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11343 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
11344 if (!InMask.
all() && NumScalars != NumElts && !IsWholeSubvector) {
11345 if (InsertVecSz != VecSz) {
11356 for (
unsigned I = OffsetEnd + 1 -
Offset;
I < VecSz; ++
I)
11365 case Instruction::ZExt:
11366 case Instruction::SExt:
11367 case Instruction::FPToUI:
11368 case Instruction::FPToSI:
11369 case Instruction::FPExt:
11370 case Instruction::PtrToInt:
11371 case Instruction::IntToPtr:
11372 case Instruction::SIToFP:
11373 case Instruction::UIToFP:
11374 case Instruction::Trunc:
11375 case Instruction::FPTrunc:
11376 case Instruction::BitCast: {
11377 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
11380 unsigned Opcode = ShuffleOrOp;
11381 unsigned VecOpcode = Opcode;
11383 (SrcIt != MinBWs.
end() || It != MinBWs.
end())) {
11385 unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy->
getScalarType());
11386 if (SrcIt != MinBWs.
end()) {
11387 SrcBWSz = SrcIt->second.first;
11394 if (BWSz == SrcBWSz) {
11395 VecOpcode = Instruction::BitCast;
11396 }
else if (BWSz < SrcBWSz) {
11397 VecOpcode = Instruction::Trunc;
11398 }
else if (It != MinBWs.
end()) {
11399 assert(BWSz > SrcBWSz &&
"Invalid cast!");
11400 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11401 }
else if (SrcIt != MinBWs.
end()) {
11402 assert(BWSz > SrcBWSz &&
"Invalid cast!");
11404 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
11406 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.
end() &&
11407 !SrcIt->second.second) {
11408 VecOpcode = Instruction::UIToFP;
11411 assert(
Idx == 0 &&
"Expected 0 index only");
11419 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
11421 auto *
VI = VL0->
getOpcode() == Opcode ? VL0 :
nullptr;
11424 bool IsArithmeticExtendedReduction =
11425 E->Idx == 0 && UserIgnoreList &&
11427 auto *
I = cast<Instruction>(V);
11428 return is_contained({Instruction::Add, Instruction::FAdd,
11429 Instruction::Mul, Instruction::FMul,
11430 Instruction::And, Instruction::Or,
11434 if (IsArithmeticExtendedReduction &&
11435 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
11437 return CommonCost +
11439 VecOpcode == Opcode ? VI :
nullptr);
11441 return GetCostDiff(GetScalarCost, GetVectorCost);
11443 case Instruction::FCmp:
11444 case Instruction::ICmp:
11445 case Instruction::Select: {
11449 match(VL0, MatchCmp))
11455 auto GetScalarCost = [&](
unsigned Idx) {
11456 if (isa<PoisonValue>(UniqueValues[
Idx]))
11459 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
11466 !
match(VI, MatchCmp)) ||
11474 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
11475 CostKind, getOperandInfo(
VI->getOperand(0)),
11476 getOperandInfo(
VI->getOperand(1)), VI);
11479 ScalarCost = IntrinsicCost;
11488 CostKind, getOperandInfo(E->getOperand(0)),
11489 getOperandInfo(E->getOperand(1)), VL0);
11490 if (
auto *SI = dyn_cast<SelectInst>(VL0)) {
11493 unsigned CondNumElements = CondType->getNumElements();
11495 assert(VecTyNumElements >= CondNumElements &&
11496 VecTyNumElements % CondNumElements == 0 &&
11497 "Cannot vectorize Instruction::Select");
11498 if (CondNumElements != VecTyNumElements) {
11507 return VecCost + CommonCost;
11509 return GetCostDiff(GetScalarCost, GetVectorCost);
11511 case TreeEntry::MinMax: {
11512 auto GetScalarCost = [&](
unsigned Idx) {
11513 return GetMinMaxCost(OrigScalarTy);
11517 return VecCost + CommonCost;
11519 return GetCostDiff(GetScalarCost, GetVectorCost);
11521 case Instruction::FNeg:
11522 case Instruction::Add:
11523 case Instruction::FAdd:
11524 case Instruction::Sub:
11525 case Instruction::FSub:
11526 case Instruction::Mul:
11527 case Instruction::FMul:
11528 case Instruction::UDiv:
11529 case Instruction::SDiv:
11530 case Instruction::FDiv:
11531 case Instruction::URem:
11532 case Instruction::SRem:
11533 case Instruction::FRem:
11534 case Instruction::Shl:
11535 case Instruction::LShr:
11536 case Instruction::AShr:
11537 case Instruction::And:
11538 case Instruction::Or:
11539 case Instruction::Xor: {
11540 auto GetScalarCost = [&](
unsigned Idx) {
11541 if (isa<PoisonValue>(UniqueValues[
Idx]))
11544 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
11545 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
11554 if (ShuffleOrOp == Instruction::And && It != MinBWs.
end()) {
11555 for (
unsigned I : seq<unsigned>(0, E->getNumOperands())) {
11558 auto *CI = dyn_cast<ConstantInt>(
Op);
11559 return CI && CI->getValue().countr_one() >= It->second.first;
11564 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
11568 Op2Info, {},
nullptr, TLI) +
11571 return GetCostDiff(GetScalarCost, GetVectorCost);
11573 case Instruction::GetElementPtr: {
11574 return CommonCost + GetGEPCostDiff(VL, VL0);
11576 case Instruction::Load: {
11577 auto GetScalarCost = [&](
unsigned Idx) {
11578 auto *
VI = cast<LoadInst>(UniqueValues[
Idx]);
11580 VI->getAlign(),
VI->getPointerAddressSpace(),
11583 auto *LI0 = cast<LoadInst>(VL0);
11586 switch (E->State) {
11587 case TreeEntry::Vectorize:
11588 if (
unsigned Factor = E->getInterleaveFactor()) {
11590 Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
11591 LI0->getPointerAddressSpace(),
CostKind);
11595 Instruction::Load, VecTy, LI0->getAlign(),
11599 case TreeEntry::StridedVectorize: {
11600 Align CommonAlignment =
11601 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11603 Instruction::Load, VecTy, LI0->getPointerOperand(),
11604 false, CommonAlignment,
CostKind);
11607 case TreeEntry::ScatterVectorize: {
11608 Align CommonAlignment =
11609 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11611 Instruction::Load, VecTy, LI0->getPointerOperand(),
11612 false, CommonAlignment,
CostKind);
11615 case TreeEntry::CombinedVectorize:
11616 case TreeEntry::NeedToGather:
11619 return VecLdCost + CommonCost;
11625 if (E->State == TreeEntry::ScatterVectorize)
11631 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
11632 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
11634 case Instruction::Store: {
11635 bool IsReorder = !E->ReorderIndices.empty();
11636 auto GetScalarCost = [=](
unsigned Idx) {
11637 auto *
VI = cast<StoreInst>(VL[
Idx]);
11640 VI->getAlign(),
VI->getPointerAddressSpace(),
11644 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
11648 if (E->State == TreeEntry::StridedVectorize) {
11649 Align CommonAlignment =
11650 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
11652 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
11653 false, CommonAlignment,
CostKind);
11655 assert(E->State == TreeEntry::Vectorize &&
11656 "Expected either strided or consecutive stores.");
11657 if (
unsigned Factor = E->getInterleaveFactor()) {
11658 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
11659 "No reused shuffles expected");
11662 Instruction::Store, VecTy, Factor, std::nullopt,
11663 BaseSI->getAlign(), BaseSI->getPointerAddressSpace(),
CostKind);
11667 Instruction::Store, VecTy, BaseSI->getAlign(),
11668 BaseSI->getPointerAddressSpace(),
CostKind, OpInfo);
11671 return VecStCost + CommonCost;
11675 unsigned Idx = IsReorder ? E->ReorderIndices[
I] :
I;
11676 PointerOps[
Idx] = cast<StoreInst>(V)->getPointerOperand();
11679 return GetCostDiff(GetScalarCost, GetVectorCost) +
11680 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
11682 case Instruction::Call: {
11683 auto GetScalarCost = [&](
unsigned Idx) {
11684 auto *CI = cast<CallInst>(UniqueValues[
Idx]);
11695 auto *CI = cast<CallInst>(VL0);
11699 It != MinBWs.
end() ? It->second.first : 0,
TTI);
11701 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
11703 return GetCostDiff(GetScalarCost, GetVectorCost);
11705 case Instruction::ShuffleVector: {
11706 if (!
SLPReVec || E->isAltShuffle())
11707 assert(E->isAltShuffle() &&
11712 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
11713 "Invalid Shuffle Vector Operand");
11716 auto TryFindNodeWithEqualOperands = [=]() {
11717 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
11720 if (
TE->isAltShuffle() &&
11721 ((
TE->getOpcode() == E->getOpcode() &&
11722 TE->getAltOpcode() == E->getAltOpcode()) ||
11723 (
TE->getOpcode() == E->getAltOpcode() &&
11724 TE->getAltOpcode() == E->getOpcode())) &&
11725 TE->hasEqualOperands(*E))
11730 auto GetScalarCost = [&](
unsigned Idx) {
11731 if (isa<PoisonValue>(UniqueValues[
Idx]))
11734 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
11735 assert(E->isOpcodeOrAlt(VI) &&
"Unexpected main/alternate opcode");
11745 if (TryFindNodeWithEqualOperands()) {
11747 dbgs() <<
"SLP: diamond match for alternate node found.\n";
11754 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy,
CostKind);
11756 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
11757 }
else if (
auto *CI0 = dyn_cast<CmpInst>(VL0)) {
11759 VecCost = TTIRef.getCmpSelInstrCost(
11760 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(),
CostKind,
11761 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11763 VecCost += TTIRef.getCmpSelInstrCost(
11764 E->getOpcode(), VecTy, MaskTy,
11765 cast<CmpInst>(E->getAltOp())->getPredicate(),
CostKind,
11766 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11769 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
11772 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
11773 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
11775 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
11776 if (SrcIt != MinBWs.
end()) {
11777 SrcBWSz = SrcIt->second.first;
11781 if (BWSz <= SrcBWSz) {
11782 if (BWSz < SrcBWSz)
11784 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
11788 <<
"SLP: alternate extension, which should be truncated.\n";
11794 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
11797 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
11801 E->buildAltOpShuffleMask(
11803 assert(E->isOpcodeOrAlt(
I) &&
"Unexpected main/alternate opcode");
11814 unsigned Opcode0 = E->getOpcode();
11815 unsigned Opcode1 = E->getAltOpcode();
11819 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
11821 VecTy, Opcode0, Opcode1, OpcodeMask,
CostKind);
11822 return AltVecCost < VecCost ? AltVecCost : VecCost;
11827 if (
SLPReVec && !E->isAltShuffle())
11828 return GetCostDiff(
11833 "Not supported shufflevector usage.");
11834 auto *SV = cast<ShuffleVectorInst>(VL.
front());
11835 unsigned SVNumElements =
11836 cast<FixedVectorType>(SV->getOperand(0)->getType())
11837 ->getNumElements();
11838 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
11839 for (
size_t I = 0,
End = VL.
size();
I !=
End;
I += GroupSize) {
11843 assert(isa<ShuffleVectorInst>(V) &&
11844 "Not supported shufflevector usage.");
11845 auto *SV = cast<ShuffleVectorInst>(V);
11847 [[maybe_unused]]
bool IsExtractSubvectorMask =
11848 SV->isExtractSubvectorMask(Index);
11849 assert(IsExtractSubvectorMask &&
11850 "Not supported shufflevector usage.");
11851 if (NextIndex != Index)
11853 NextIndex += SV->getShuffleMask().size();
11856 return ::getShuffleCost(
11862 return GetCostDiff(GetScalarCost, GetVectorCost);
11864 case Instruction::Freeze:
11871bool BoUpSLP::isFullyVectorizableTinyTree(
bool ForReduction)
const {
11873 << VectorizableTree.size() <<
" is fully vectorizable .\n");
11875 auto &&AreVectorizableGathers = [
this](
const TreeEntry *
TE,
unsigned Limit) {
11877 return TE->isGather() &&
11879 [
this](
Value *V) { return EphValues.contains(V); }) &&
11881 TE->Scalars.size() < Limit ||
11882 ((
TE->getOpcode() == Instruction::ExtractElement ||
11883 all_of(
TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
11885 (
TE->getOpcode() == Instruction::Load && !
TE->isAltShuffle()) ||
11886 any_of(
TE->Scalars, IsaPred<LoadInst>));
11890 if (VectorizableTree.size() == 1 &&
11891 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
11892 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
11894 AreVectorizableGathers(VectorizableTree[0].
get(),
11895 VectorizableTree[0]->Scalars.size()) &&
11896 VectorizableTree[0]->getVectorFactor() > 2)))
11899 if (VectorizableTree.size() != 2)
11907 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
11908 AreVectorizableGathers(VectorizableTree[1].
get(),
11909 VectorizableTree[0]->Scalars.size()))
11913 if (VectorizableTree[0]->
isGather() ||
11914 (VectorizableTree[1]->isGather() &&
11915 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
11916 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
11924 bool MustMatchOrInst) {
11928 Value *ZextLoad = Root;
11929 const APInt *ShAmtC;
11930 bool FoundOr =
false;
11931 while (!isa<ConstantExpr>(ZextLoad) &&
11934 ShAmtC->
urem(8) == 0))) {
11935 auto *BinOp = cast<BinaryOperator>(ZextLoad);
11936 ZextLoad = BinOp->getOperand(0);
11937 if (BinOp->getOpcode() == Instruction::Or)
11942 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
11949 Type *SrcTy = Load->getType();
11956 LLVM_DEBUG(
dbgs() <<
"SLP: Assume load combining for tree starting at "
11957 << *(cast<Instruction>(Root)) <<
"\n");
11966 unsigned NumElts = VectorizableTree[0]->Scalars.size();
11967 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
11975 unsigned NumElts = Stores.
size();
11976 for (
Value *Scalar : Stores) {
11990 if (VectorizableTree.empty()) {
11991 assert(ExternalUses.empty() &&
"We shouldn't have any external users");
11997 if (VectorizableTree.size() == 2 &&
11998 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
11999 VectorizableTree[1]->isGather() &&
12000 (VectorizableTree[1]->getVectorFactor() <= 2 ||
12001 !(
isSplat(VectorizableTree[1]->Scalars) ||
12009 constexpr int Limit = 4;
12011 !VectorizableTree.empty() &&
12012 all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
12013 return (TE->isGather() &&
12014 TE->getOpcode() != Instruction::ExtractElement &&
12015 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
12016 TE->getOpcode() == Instruction::PHI;
12027 if (isFullyVectorizableTinyTree(ForReduction))
12032 bool IsAllowedSingleBVNode =
12033 VectorizableTree.size() > 1 ||
12034 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
12035 !VectorizableTree.front()->isAltShuffle() &&
12036 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
12037 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
12039 if (
any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
12040 return TE->isGather() &&
all_of(TE->Scalars, [&](
Value *V) {
12041 return isa<ExtractElementInst, UndefValue>(V) ||
12042 (IsAllowedSingleBVNode &&
12043 !V->hasNUsesOrMore(UsesLimit) &&
12044 any_of(V->users(), IsaPred<InsertElementInst>));
12049 if (VectorizableTree.back()->isGather() &&
12050 VectorizableTree.back()->isAltShuffle() &&
12051 VectorizableTree.back()->getVectorFactor() > 2 &&
12053 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
12055 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
12056 VectorizableTree.back()->getVectorFactor()),
12069 constexpr unsigned SmallTree = 3;
12070 if (VectorizableTree.front()->isNonPowOf2Vec() &&
12073 [](
const std::unique_ptr<TreeEntry> &TE) {
12074 return TE->isGather() &&
12075 TE->getOpcode() == Instruction::Load &&
12083 TreeEntry &E = *VectorizableTree[
Idx];
12086 if (E.getOpcode() && E.getOpcode() != Instruction::Load)
12100 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
12113 for (
const auto &TEPtr : VectorizableTree) {
12114 if (TEPtr->State != TreeEntry::Vectorize)
12116 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
12122 auto *NodeA = DT->
getNode(
A->getParent());
12123 auto *NodeB = DT->
getNode(
B->getParent());
12124 assert(NodeA &&
"Should only process reachable instructions");
12125 assert(NodeB &&
"Should only process reachable instructions");
12126 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12127 "Different nodes should have different DFS numbers");
12128 if (NodeA != NodeB)
12129 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12130 return B->comesBefore(
A);
12140 LiveValues.
erase(PrevInst);
12141 for (
auto &J : PrevInst->
operands()) {
12142 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
12143 LiveValues.
insert(cast<Instruction>(&*J));
12147 dbgs() <<
"SLP: #LV: " << LiveValues.
size();
12148 for (
auto *
X : LiveValues)
12149 dbgs() <<
" " <<
X->getName();
12150 dbgs() <<
", Looking at ";
12155 unsigned NumCalls = 0;
12159 while (InstIt != PrevInstIt) {
12160 if (PrevInstIt == PrevInst->
getParent()->rend()) {
12161 PrevInstIt = Inst->getParent()->rbegin();
12166 if (
auto *
II = dyn_cast<IntrinsicInst>(
I)) {
12167 if (
II->isAssumeLikeIntrinsic())
12171 for (
auto &ArgOp :
II->args())
12172 Tys.push_back(ArgOp->getType());
12173 if (
auto *FPMO = dyn_cast<FPMathOperator>(
II))
12174 FMF = FPMO->getFastMathFlags();
12181 if (IntrCost < CallCost)
12188 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
12189 &*PrevInstIt != PrevInst)
12197 for (
auto *
II : LiveValues) {
12198 auto *ScalarTy =
II->getType();
12199 if (
auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
12200 ScalarTy = VectorTy->getElementType();
12218 const auto *I1 = IE1;
12219 const auto *I2 = IE2;
12231 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
12233 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
12234 if (I2 && ((I2 == IE2 || I2->
hasOneUse())) &&
12236 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
12237 }
while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
12244struct ValueSelect {
12245 template <
typename U>
12246 static std::enable_if_t<std::is_same_v<Value *, U>,
Value *>
get(
Value *V) {
12249 template <
typename U>
12250 static std::enable_if_t<!std::is_same_v<Value *, U>,
U>
get(
Value *) {
12268template <
typename T>
12274 assert(!ShuffleMask.empty() &&
"Empty list of shuffles for inserts.");
12276 auto VMIt = std::next(ShuffleMask.begin());
12279 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
12281 if (!IsBaseUndef.
all()) {
12283 std::pair<T *, bool> Res =
12284 ResizeAction(ShuffleMask.begin()->first, Mask,
false);
12286 for (
unsigned Idx = 0, VF = Mask.size();
Idx < VF; ++
Idx) {
12290 Mask[
Idx] = (Res.second ?
Idx : Mask[
Idx]) + VF;
12292 auto *V = ValueSelect::get<T *>(
Base);
12294 assert((!V || GetVF(V) == Mask.size()) &&
12295 "Expected base vector of VF number of elements.");
12296 Prev = Action(Mask, {
nullptr, Res.first});
12297 }
else if (ShuffleMask.size() == 1) {
12300 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
12306 Prev = Action(Mask, {ShuffleMask.begin()->first});
12310 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
12311 unsigned Vec2VF = GetVF(VMIt->first);
12312 if (Vec1VF == Vec2VF) {
12316 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
12319 Mask[
I] = SecMask[
I] + Vec1VF;
12322 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
12325 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
12327 std::pair<T *, bool> Res2 =
12328 ResizeAction(VMIt->first, VMIt->second,
false);
12330 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
12337 Mask[
I] = (Res2.second ?
I : SecMask[
I]) + VF;
12340 Prev = Action(Mask, {Res1.first, Res2.first});
12342 VMIt = std::next(VMIt);
12344 bool IsBaseNotUndef = !IsBaseUndef.
all();
12345 (void)IsBaseNotUndef;
12347 for (
auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
12349 std::pair<T *, bool> Res =
12350 ResizeAction(VMIt->first, VMIt->second,
false);
12352 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
12355 "Multiple uses of scalars.");
12356 Mask[
I] = (Res.second ?
I : SecMask[
I]) + VF;
12361 Prev = Action(Mask, {Prev, Res.first});
12369template <
typename T>
struct ShuffledInsertData {
12380 << VectorizableTree.size() <<
".\n");
12382 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
12385 for (
unsigned I = 0, E = VectorizableTree.size();
I < E; ++
I) {
12386 TreeEntry &TE = *VectorizableTree[
I];
12389 if (TE.State == TreeEntry::CombinedVectorize) {
12391 dbgs() <<
"SLP: Skipping cost for combined node that starts with "
12392 << *TE.Scalars[0] <<
".\n";
12393 TE.dump();
dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12396 if (TE.isGather()) {
12397 if (
const TreeEntry *E = getTreeEntry(TE.getMainOp());
12398 E && E->getVectorFactor() == TE.getVectorFactor() &&
12399 E->isSame(TE.Scalars)) {
12404 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12411 assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
12412 "Expected gather nodes with users only.");
12418 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12427 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
12434 for (ExternalUser &EU : ExternalUses) {
12435 ScalarUserAndIdx.
emplace_back(EU.Scalar, EU.User, EU.Lane);
12437 for (ExternalUser &EU : ExternalUses) {
12441 if (EphValues.
count(EU.User))
12447 EU.User ? cast<Instruction>(EU.User)->
getParent() :
nullptr;
12450 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
12454 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
12455 !ExtractCostCalculated.
insert(EU.Scalar).second)
12459 if (isa<FixedVectorType>(EU.Scalar->getType()))
12464 if (
auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
12466 if (
auto *FTy = dyn_cast<FixedVectorType>(VU->
getType())) {
12467 if (!UsedInserts.
insert(VU).second)
12471 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
12474 [
this, VU](
const ShuffledInsertData<const TreeEntry *> &
Data) {
12479 Value *Op0 =
II->getOperand(0);
12480 if (getTreeEntry(
II) && !getTreeEntry(Op0))
12486 if (It == ShuffledInserts.
end()) {
12488 Data.InsertElements.emplace_back(VU);
12490 VecId = ShuffledInserts.
size() - 1;
12491 auto It = MinBWs.
find(ScalarTE);
12492 if (It != MinBWs.
end() &&
12494 .
insert(std::make_pair(ScalarTE, FTy->getElementType()))
12496 unsigned BWSz = It->second.first;
12497 unsigned DstBWSz =
DL->getTypeSizeInBits(FTy->getElementType());
12498 unsigned VecOpcode;
12499 if (DstBWSz < BWSz)
12500 VecOpcode = Instruction::Trunc;
12503 It->second.second ? Instruction::SExt : Instruction::ZExt;
12508 FTy->getNumElements()),
12511 <<
" for extending externally used vector with "
12512 "non-equal minimum bitwidth.\n");
12517 It->InsertElements.front() = VU;
12518 VecId = std::distance(ShuffledInserts.
begin(), It);
12520 int InIdx = *InsertIdx;
12522 ShuffledInserts[VecId].ValueMasks[ScalarTE];
12525 Mask[InIdx] = EU.Lane;
12526 DemandedElts[VecId].setBit(InIdx);
12537 auto *VecTy =
getWidenedType(EU.Scalar->getType(), BundleWidth);
12538 const TreeEntry *Entry = getTreeEntry(EU.Scalar);
12539 auto It = MinBWs.
find(Entry);
12540 if (It != MinBWs.
end()) {
12543 ? Instruction::ZExt
12544 : Instruction::SExt;
12551 EU.Lane, EU.Scalar, ScalarUserAndIdx);
12554 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
12555 Entry->getOpcode() == Instruction::Load) {
12557 auto IsPhiInLoop = [&](
const ExternalUser &U) {
12558 if (
auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
12559 auto *
I = cast<Instruction>(U.Scalar);
12560 const Loop *L = LI->getLoopFor(Phi->getParent());
12561 return L && (Phi->getParent() ==
I->getParent() ||
12562 L == LI->getLoopFor(
I->getParent()));
12566 if (!ValueToExtUses) {
12567 ValueToExtUses.emplace();
12570 if (IsPhiInLoop(
P.value()))
12573 ValueToExtUses->try_emplace(
P.value().Scalar,
P.index());
12578 auto *Inst = cast<Instruction>(EU.Scalar);
12580 auto OperandIsScalar = [&](
Value *V) {
12581 if (!getTreeEntry(V)) {
12585 if (
auto *EE = dyn_cast<ExtractElementInst>(V))
12586 return !EE->hasOneUse() || !MustGather.contains(EE);
12589 return ValueToExtUses->contains(V);
12591 bool CanBeUsedAsScalar =
all_of(Inst->operands(), OperandIsScalar);
12592 bool CanBeUsedAsScalarCast =
false;
12593 if (
auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
12594 if (
auto *
Op = dyn_cast<Instruction>(CI->
getOperand(0));
12595 Op &&
all_of(
Op->operands(), OperandIsScalar)) {
12597 (getTreeEntry(
Op) && !ValueToExtUses->contains(
Op))
12600 if (ScalarCost + OpCost <= ExtraCost) {
12601 CanBeUsedAsScalar = CanBeUsedAsScalarCast =
true;
12602 ScalarCost += OpCost;
12606 if (CanBeUsedAsScalar) {
12607 bool KeepScalar = ScalarCost <= ExtraCost;
12611 bool IsProfitablePHIUser =
12613 VectorizableTree.front()->Scalars.size() > 2)) &&
12614 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
12618 auto *PHIUser = dyn_cast<PHINode>(U);
12619 return (!PHIUser ||
12620 PHIUser->getParent() !=
12622 VectorizableTree.front()->getMainOp())
12627 return ValueToExtUses->contains(V);
12629 if (IsProfitablePHIUser) {
12633 (!GatheredLoadsEntriesFirst.has_value() ||
12634 Entry->Idx < *GatheredLoadsEntriesFirst)) {
12635 unsigned ScalarUsesCount =
count_if(Entry->Scalars, [&](
Value *V) {
12636 return ValueToExtUses->contains(V);
12638 auto It = ExtractsCount.
find(Entry);
12639 if (It != ExtractsCount.
end()) {
12640 assert(ScalarUsesCount >= It->getSecond().size() &&
12641 "Expected total number of external uses not less than "
12642 "number of scalar uses.");
12643 ScalarUsesCount -= It->getSecond().size();
12648 KeepScalar = ScalarUsesCount <= 1 || !
has_single_bit(ScalarUsesCount);
12651 ExternalUsesAsOriginalScalar.
insert(EU.Scalar);
12653 auto It = ValueToExtUses->find(V);
12654 if (It != ValueToExtUses->end()) {
12656 ExternalUses[It->second].User = nullptr;
12659 ExtraCost = ScalarCost;
12660 if (!IsPhiInLoop(EU))
12661 ExtractsCount[Entry].
insert(Inst);
12662 if (CanBeUsedAsScalarCast) {
12663 ScalarOpsFromCasts.
insert(Inst->getOperand(0));
12666 if (
auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
12668 auto It = ValueToExtUses->find(V);
12669 if (It != ValueToExtUses->end()) {
12671 ExternalUses[It->second].User = nullptr;
12680 ExtractCost += ExtraCost;
12684 for (
Value *V : ScalarOpsFromCasts) {
12685 ExternalUsesAsOriginalScalar.
insert(V);
12686 if (
const TreeEntry *E = getTreeEntry(V)) {
12687 ExternalUses.emplace_back(V,
nullptr, E->findLaneForValue(V));
12691 if (!VectorizedVals.
empty()) {
12692 const TreeEntry &Root = *VectorizableTree.front();
12693 auto BWIt = MinBWs.find(&Root);
12694 if (BWIt != MinBWs.end()) {
12695 Type *DstTy = Root.Scalars.front()->getType();
12698 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
12699 if (OriginalSz != SrcSz) {
12700 unsigned Opcode = Instruction::Trunc;
12701 if (OriginalSz > SrcSz)
12702 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
12704 if (
auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
12716 Cost += SpillCost + ExtractCost;
12720 unsigned VF =
Mask.size();
12721 unsigned VecVF =
TE->getVectorFactor();
12723 (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); }) ||
12726 std::copy(
Mask.begin(), std::next(
Mask.begin(), std::min(VF, VecVF)),
12732 dbgs() <<
"SLP: Adding cost " <<
C
12733 <<
" for final shuffle of insertelement external users.\n";
12734 TE->dump();
dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12736 return std::make_pair(TE,
true);
12738 return std::make_pair(TE,
false);
12741 for (
int I = 0, E = ShuffledInserts.size();
I < E; ++
I) {
12742 Value *
Base = ShuffledInserts[
I].InsertElements.front()->getOperand(0);
12743 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
12747 assert((TEs.size() == 1 || TEs.size() == 2) &&
12748 "Expected exactly 1 or 2 tree entries.");
12749 if (TEs.size() == 1) {
12751 VF = TEs.front()->getVectorFactor();
12752 auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12756 (
Data.index() < VF &&
12757 static_cast<int>(
Data.index()) ==
Data.value());
12762 <<
" for final shuffle of insertelement "
12763 "external users.\n";
12764 TEs.front()->
dump();
12765 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12771 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
12772 VF = TEs.front()->getVectorFactor();
12776 auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12780 <<
" for final shuffle of vector node and external "
12781 "insertelement users.\n";
12782 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
12783 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12789 (void)performExtractsShuffleAction<const TreeEntry>(
12791 [](
const TreeEntry *E) {
return E->getVectorFactor(); }, ResizeToVF,
12792 EstimateShufflesCost);
12794 cast<FixedVectorType>(
12795 ShuffledInserts[
I].InsertElements.front()->getType()),
12798 Cost -= InsertCost;
12802 if (ReductionBitWidth != 0) {
12803 assert(UserIgnoreList &&
"Expected reduction tree.");
12804 const TreeEntry &E = *VectorizableTree.front();
12805 auto It = MinBWs.find(&E);
12806 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
12807 unsigned SrcSize = It->second.first;
12808 unsigned DstSize = ReductionBitWidth;
12809 unsigned Opcode = Instruction::Trunc;
12810 if (SrcSize < DstSize) {
12811 bool IsArithmeticExtendedReduction =
12813 auto *
I = cast<Instruction>(V);
12814 return is_contained({Instruction::Add, Instruction::FAdd,
12815 Instruction::Mul, Instruction::FMul,
12816 Instruction::And, Instruction::Or,
12820 if (IsArithmeticExtendedReduction)
12822 Instruction::BitCast;
12824 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12826 if (Opcode != Instruction::BitCast) {
12828 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
12830 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
12833 switch (E.getOpcode()) {
12834 case Instruction::SExt:
12835 case Instruction::ZExt:
12836 case Instruction::Trunc: {
12837 const TreeEntry *OpTE = getOperandEntry(&E, 0);
12838 CCH = getCastContextHint(*OpTE);
12848 <<
" for final resize for reduction from " << SrcVecTy
12849 <<
" to " << DstVecTy <<
"\n";
12850 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12859 OS <<
"SLP: Spill Cost = " << SpillCost <<
".\n"
12860 <<
"SLP: Extract Cost = " << ExtractCost <<
".\n"
12861 <<
"SLP: Total Cost = " <<
Cost <<
".\n";
12865 ViewGraph(
this,
"SLP" +
F->getName(),
false, Str);
12876std::optional<TTI::ShuffleKind>
12877BoUpSLP::tryToGatherSingleRegisterExtractElements(
12883 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
12884 auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
12886 if (isa<UndefValue>(VL[
I]))
12890 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
12891 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
12904 ExtractMask.reset(*
Idx);
12909 VectorOpToIdx[EI->getVectorOperand()].push_back(
I);
12914 stable_sort(Vectors, [](
const auto &P1,
const auto &P2) {
12915 return P1.second.size() > P2.second.size();
12918 const int UndefSz = UndefVectorExtracts.
size();
12919 unsigned SingleMax = 0;
12920 unsigned PairMax = 0;
12921 if (!Vectors.
empty()) {
12922 SingleMax = Vectors.
front().second.size() + UndefSz;
12923 if (Vectors.
size() > 1) {
12924 auto *ItNext = std::next(Vectors.
begin());
12925 PairMax = SingleMax + ItNext->second.size();
12928 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
12929 return std::nullopt;
12935 if (SingleMax >= PairMax && SingleMax) {
12936 for (
int Idx : Vectors.
front().second)
12938 }
else if (!Vectors.
empty()) {
12939 for (
unsigned Idx : {0, 1})
12940 for (
int Idx : Vectors[
Idx].second)
12944 for (
int Idx : UndefVectorExtracts)
12948 std::optional<TTI::ShuffleKind> Res =
12954 return std::nullopt;
12958 for (
int I = 0, E = GatheredExtracts.size();
I < E; ++
I) {
12959 if (Mask[
I] ==
PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[
I]) &&
12960 isa<UndefValue>(GatheredExtracts[
I])) {
12964 auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
12965 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
12966 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
12981 unsigned NumParts)
const {
12982 assert(NumParts > 0 &&
"NumParts expected be greater than or equal to 1.");
12986 for (
unsigned Part : seq<unsigned>(NumParts)) {
12992 std::optional<TTI::ShuffleKind> Res =
12993 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
12994 ShufflesRes[Part] = Res;
12995 copy(SubMask, std::next(
Mask.begin(), Part * SliceSize));
12997 if (
none_of(ShufflesRes, [](
const std::optional<TTI::ShuffleKind> &Res) {
12998 return Res.has_value();
13000 ShufflesRes.clear();
13001 return ShufflesRes;
13004std::optional<TargetTransformInfo::ShuffleKind>
13005BoUpSLP::isGatherShuffledSingleRegisterEntry(
13011 const EdgeInfo &TEUseEI =
TE == VectorizableTree.front().get()
13012 ? EdgeInfo(
const_cast<TreeEntry *
>(TE), 0)
13013 :
TE->UserTreeIndices.front();
13014 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
13018 if (
auto *
PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
13019 TEInsertBlock =
PHI->getIncomingBlock(TEUseEI.EdgeIdx);
13022 TEInsertBlock = TEInsertPt->
getParent();
13025 return std::nullopt;
13026 auto *NodeUI = DT->
getNode(TEInsertBlock);
13027 assert(NodeUI &&
"Should only process reachable instructions");
13029 auto CheckOrdering = [&](
const Instruction *InsertPt) {
13043 auto *NodeEUI = DT->
getNode(InsertBlock);
13046 assert((NodeUI == NodeEUI) ==
13047 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
13048 "Different nodes should have different DFS numbers");
13050 if (TEInsertPt->
getParent() != InsertBlock &&
13053 if (TEInsertPt->
getParent() == InsertBlock &&
13067 for (
Value *V : VL) {
13072 for (
const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
13073 if (TEPtr == TE || TEPtr->Idx == 0)
13076 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
13077 "Must contain at least single gathered value.");
13078 assert(TEPtr->UserTreeIndices.size() == 1 &&
13079 "Expected only single user of a gather node.");
13080 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
13082 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
13085 : &getLastInstructionInBundle(UseEI.UserTE);
13086 if (TEInsertPt == InsertPt) {
13090 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
13094 if (TEUseEI.UserTE != UseEI.UserTE &&
13095 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
13101 if ((TEInsertBlock != InsertPt->
getParent() ||
13102 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
13103 !CheckOrdering(InsertPt))
13107 if (
const TreeEntry *VTE = getTreeEntry(V)) {
13108 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
13109 if (VTE->State != TreeEntry::Vectorize) {
13110 auto It = MultiNodeScalars.
find(V);
13111 if (It == MultiNodeScalars.
end())
13113 VTE = *It->getSecond().begin();
13115 auto *MIt =
find_if(It->getSecond(), [](
const TreeEntry *MTE) {
13116 return MTE->State == TreeEntry::Vectorize;
13118 if (MIt == It->getSecond().end())
13123 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
13124 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
13128 if (VToTEs.
empty())
13130 if (UsedTEs.
empty()) {
13144 if (!VToTEs.
empty()) {
13150 VToTEs = SavedVToTEs;
13159 if (UsedTEs.
size() == 2)
13161 UsedTEs.push_back(SavedVToTEs);
13168 if (UsedTEs.
empty()) {
13170 return std::nullopt;
13174 if (UsedTEs.
size() == 1) {
13177 UsedTEs.front().
end());
13178 sort(FirstEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
13179 return TE1->Idx < TE2->Idx;
13182 auto *It =
find_if(FirstEntries, [=](
const TreeEntry *EntryPtr) {
13183 return EntryPtr->isSame(VL) || EntryPtr->isSame(
TE->Scalars);
13185 if (It != FirstEntries.end() &&
13186 ((*It)->getVectorFactor() == VL.size() ||
13187 ((*It)->getVectorFactor() ==
TE->Scalars.size() &&
13188 TE->ReuseShuffleIndices.size() == VL.size() &&
13189 (*It)->isSame(
TE->Scalars)))) {
13190 Entries.push_back(*It);
13191 if ((*It)->getVectorFactor() == VL.size()) {
13192 std::iota(std::next(
Mask.begin(), Part * VL.size()),
13193 std::next(
Mask.begin(), (Part + 1) * VL.size()), 0);
13199 for (
unsigned I : seq<unsigned>(VL.size()))
13200 if (isa<PoisonValue>(VL[
I]))
13206 Entries.push_back(FirstEntries.front());
13207 VF = FirstEntries.front()->getVectorFactor();
13210 assert(UsedTEs.
size() == 2 &&
"Expected at max 2 permuted entries.");
13213 for (
const TreeEntry *TE : UsedTEs.front()) {
13214 unsigned VF =
TE->getVectorFactor();
13215 auto It = VFToTE.
find(VF);
13216 if (It != VFToTE.
end()) {
13217 if (It->second->Idx >
TE->Idx)
13218 It->getSecond() =
TE;
13225 UsedTEs.back().
end());
13226 sort(SecondEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
13227 return TE1->Idx < TE2->Idx;
13229 for (
const TreeEntry *TE : SecondEntries) {
13230 auto It = VFToTE.
find(
TE->getVectorFactor());
13231 if (It != VFToTE.
end()) {
13233 Entries.push_back(It->second);
13234 Entries.push_back(TE);
13240 if (Entries.empty()) {
13242 UsedTEs.front(), [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
13243 return TE1->Idx < TE2->Idx;
13245 Entries.push_back(SecondEntries.front());
13246 VF = std::max(Entries.front()->getVectorFactor(),
13247 Entries.back()->getVectorFactor());
13249 VF = Entries.front()->getVectorFactor();
13253 bool IsSplatOrUndefs =
isSplat(VL) ||
all_of(VL, IsaPred<UndefValue>);
13256 auto AreCompatiblePHIs = [&](
Value *
V,
Value *V1) {
13257 auto *
PHI = cast<PHINode>(V);
13258 auto *PHI1 = cast<PHINode>(V1);
13263 for (
int I = 0, E =
PHI->getNumIncomingValues();
I < E; ++
I) {
13265 Value *In1 = PHI1->getIncomingValue(
I);
13270 if (cast<Instruction>(In)->
getParent() !=
13280 auto MightBeIgnored = [=](
Value *
V) {
13281 auto *
I = dyn_cast<Instruction>(V);
13282 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.
count(
I) &&
13284 !areAllUsersVectorized(
I, UserIgnoreList) &&
isSimple(
I);
13289 auto NeighborMightBeIgnored = [&](
Value *
V,
int Idx) {
13291 bool UsedInSameVTE =
false;
13292 auto It = UsedValuesEntry.
find(V1);
13293 if (It != UsedValuesEntry.
end())
13294 UsedInSameVTE = It->second == UsedValuesEntry.
find(V)->second;
13295 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
13297 cast<Instruction>(V)->getParent() ==
13298 cast<Instruction>(V1)->getParent() &&
13299 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
13304 for (
int I = 0, E = VL.size();
I < E; ++
I) {
13306 auto It = UsedValuesEntry.
find(V);
13307 if (It == UsedValuesEntry.
end())
13313 ((
I > 0 && NeighborMightBeIgnored(V,
I - 1)) ||
13314 (
I != E - 1 && NeighborMightBeIgnored(V,
I + 1)))))
13316 unsigned Idx = It->second;
13323 for (
unsigned I = 0, Sz = Entries.size();
I < Sz; ++
I) {
13324 if (!UsedIdxs.test(
I))
13330 for (std::pair<unsigned, int> &Pair : EntryLanes)
13331 if (Pair.first ==
I)
13332 Pair.first = TempEntries.
size();
13335 Entries.swap(TempEntries);
13336 if (EntryLanes.size() == Entries.size() &&
13338 .
slice(Part * VL.size(),
13339 std::min<int>(VL.size(),
TE->Scalars.size())))) {
13345 return std::nullopt;
13348 bool IsIdentity = Entries.size() == 1;
13351 for (
const std::pair<unsigned, int> &Pair : EntryLanes) {
13352 unsigned Idx = Part * VL.size() + Pair.second;
13355 (ForOrder ? std::distance(
13356 Entries[Pair.first]->Scalars.begin(),
13357 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
13358 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
13359 IsIdentity &=
Mask[
Idx] == Pair.second;
13361 if (ForOrder || IsIdentity || Entries.empty()) {
13362 switch (Entries.size()) {
13364 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13368 if (EntryLanes.size() > 2 || VL.size() <= 2)
13374 }
else if (!isa<VectorType>(VL.front()->getType()) &&
13375 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
13378 std::next(
Mask.begin(), (Part + 1) * VL.size()));
13379 int MinElement = SubMask.
front(), MaxElement = SubMask.
front();
13380 for (
int Idx : SubMask) {
13388 assert(MaxElement >= 0 && MinElement >= 0 &&
13389 MaxElement % VF >= MinElement % VF &&
13390 "Expected at least single element.");
13391 unsigned NewVF = std::max<unsigned>(
13393 (MaxElement % VF) -
13394 (MinElement % VF) + 1));
13399 Idx = (
Idx % VF) - (MinElement % VF) +
13400 (
Idx >=
static_cast<int>(VF) ? NewVF : 0);
13407 auto *MaskVecTy =
getWidenedType(VL.front()->getType(), SubMask.size());
13408 auto GetShuffleCost = [&,
13412 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
13414 Mask, Entries.front()->getInterleaveFactor()))
13416 return ::getShuffleCost(
TTI,
13421 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
13424 if (Entries.size() == 1 || !Entries[0]->isGather()) {
13425 FirstShuffleCost = ShuffleCost;
13429 bool IsIdentity =
true;
13431 if (
Idx >=
static_cast<int>(VF)) {
13436 IsIdentity &=
static_cast<int>(
I) ==
Idx;
13440 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
13442 MaskVecTy, DemandedElts,
true,
13447 if (Entries.size() == 1 || !Entries[1]->isGather()) {
13448 SecondShuffleCost = ShuffleCost;
13452 bool IsIdentity =
true;
13454 if (
Idx <
static_cast<int>(VF) &&
Idx >= 0) {
13460 IsIdentity &=
static_cast<int>(
I) ==
Idx;
13465 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
13467 MaskVecTy, DemandedElts,
true,
13477 const TreeEntry *BestEntry =
nullptr;
13478 if (FirstShuffleCost < ShuffleCost) {
13479 copy(FirstMask, std::next(
Mask.begin(), Part * VL.size()));
13480 BestEntry = Entries.front();
13481 ShuffleCost = FirstShuffleCost;
13483 if (SecondShuffleCost < ShuffleCost) {
13484 copy(SecondMask, std::next(
Mask.begin(), Part * VL.size()));
13485 BestEntry = Entries[1];
13486 ShuffleCost = SecondShuffleCost;
13488 if (BuildVectorCost >= ShuffleCost) {
13491 Entries.push_back(BestEntry);
13499 std::fill(std::next(
Mask.begin(), Part * VL.size()),
13501 return std::nullopt;
13505BoUpSLP::isGatherShuffledEntry(
13509 assert(NumParts > 0 && NumParts < VL.
size() &&
13510 "Expected positive number of registers.");
13513 if (TE == VectorizableTree.front().get() &&
13514 (!GatheredLoadsEntriesFirst.has_value() ||
13516 [](
const std::unique_ptr<TreeEntry> &TE) {
13517 return !
TE->isGather();
13521 if (
TE->isNonPowOf2Vec())
13524 assert((
TE->UserTreeIndices.size() == 1 ||
13525 TE == VectorizableTree.front().get()) &&
13526 "Expected only single user of the gather node.");
13528 "Number of scalars must be divisible by NumParts.");
13529 if (!
TE->UserTreeIndices.empty() &&
13530 TE->UserTreeIndices.front().UserTE->isGather() &&
13531 TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
13532 assert((
TE->Idx == 0 ||
TE->getOpcode() == Instruction::ExtractElement ||
13534 "Expected splat or extractelements only node.");
13539 for (
unsigned Part : seq<unsigned>(NumParts)) {
13543 std::optional<TTI::ShuffleKind> SubRes =
13544 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
13547 SubEntries.
clear();
13550 SubEntries.
front()->getVectorFactor() == VL.
size() &&
13551 (SubEntries.
front()->isSame(
TE->Scalars) ||
13552 SubEntries.
front()->isSame(VL))) {
13554 LocalSubEntries.
swap(SubEntries);
13557 std::iota(
Mask.begin(),
Mask.end(), 0);
13559 for (
int I = 0, Sz = VL.
size();
I < Sz; ++
I)
13560 if (isa<PoisonValue>(VL[
I]))
13562 Entries.emplace_back(1, LocalSubEntries.
front());
13568 [](
const std::optional<TTI::ShuffleKind> &SK) {
return !SK; })) {
13576 Type *ScalarTy)
const {
13578 bool DuplicateNonConst =
false;
13586 auto EstimateInsertCost = [&](
unsigned I,
Value *
V) {
13587 if (
V->getType() != ScalarTy) {
13598 for (
unsigned I = 0, E = VL.
size();
I < E; ++
I) {
13601 if ((ForPoisonSrc &&
isConstant(V)) || isa<UndefValue>(V)) {
13609 EstimateInsertCost(
I, V);
13610 ShuffleMask[
I] =
I;
13614 DuplicateNonConst =
true;
13616 ShuffleMask[
I] = Res.first->second;
13618 if (ForPoisonSrc) {
13619 if (isa<FixedVectorType>(ScalarTy)) {
13625 for (
unsigned I : seq<unsigned>(VL.
size()))
13626 if (!ShuffledElements[
I])
13629 I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
13637 if (DuplicateNonConst)
13639 VecTy, ShuffleMask);
13643Instruction &BoUpSLP::getLastInstructionInBundle(
const TreeEntry *E) {
13644 auto &Res = EntryToLastInstruction.
try_emplace(E).first->second;
13650 auto *Front = E->getMainOp();
13652 assert(((GatheredLoadsEntriesFirst.has_value() &&
13653 E->getOpcode() == Instruction::Load && E->isGather() &&
13654 E->Idx < *GatheredLoadsEntriesFirst) ||
13656 [=](
Value *V) ->
bool {
13657 if (E->getOpcode() == Instruction::GetElementPtr &&
13658 !isa<GetElementPtrInst>(V))
13660 auto *I = dyn_cast<Instruction>(V);
13661 return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
13662 isVectorLikeInstWithConstOps(I);
13664 "Expected gathered loads or GEPs or instructions from same basic "
13667 auto FindLastInst = [&]() {
13669 for (
Value *V : E->Scalars) {
13670 auto *
I = dyn_cast<Instruction>(V);
13673 if (LastInst->
getParent() ==
I->getParent()) {
13678 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13679 !isa<GetElementPtrInst>(
I)) ||
13682 (GatheredLoadsEntriesFirst.has_value() &&
13683 E->getOpcode() == Instruction::Load && E->isGather() &&
13684 E->Idx < *GatheredLoadsEntriesFirst)) &&
13685 "Expected vector-like or non-GEP in GEP node insts only.");
13693 auto *NodeB = DT->
getNode(
I->getParent());
13694 assert(NodeA &&
"Should only process reachable instructions");
13695 assert(NodeB &&
"Should only process reachable instructions");
13696 assert((NodeA == NodeB) ==
13697 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13698 "Different nodes should have different DFS numbers");
13699 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
13706 auto FindFirstInst = [&]() {
13708 for (
Value *V : E->Scalars) {
13709 auto *
I = dyn_cast<Instruction>(V);
13712 if (FirstInst->
getParent() ==
I->getParent()) {
13713 if (
I->comesBefore(FirstInst))
13717 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13718 !isa<GetElementPtrInst>(
I)) ||
13721 "Expected vector-like or non-GEP in GEP node insts only.");
13729 auto *NodeB = DT->
getNode(
I->getParent());
13730 assert(NodeA &&
"Should only process reachable instructions");
13731 assert(NodeB &&
"Should only process reachable instructions");
13732 assert((NodeA == NodeB) ==
13733 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13734 "Different nodes should have different DFS numbers");
13735 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
13742 if (GatheredLoadsEntriesFirst.has_value() &&
13743 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
13744 E->getOpcode() == Instruction::Load) {
13745 Res = FindFirstInst();
13753 if ((E->getOpcode() == Instruction::GetElementPtr &&
13756 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
13760 return isa<PoisonValue>(V) ||
13761 (!isVectorLikeInstWithConstOps(V) &&
13762 isUsedOutsideBlock(V));
13764 (E->isGather() && E->Idx == 0 &&
all_of(E->Scalars, [](
Value *V) {
13765 return isa<ExtractElementInst, UndefValue>(V) ||
13766 areAllOperandsNonInsts(V);
13768 Res = FindLastInst();
13770 Res = FindFirstInst();
13778 if (BlocksSchedules.count(BB) && !E->isGather()) {
13779 Value *
V = E->isOneOf(E->Scalars.back());
13782 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
13783 if (Bundle && Bundle->isPartOfBundle())
13784 for (; Bundle; Bundle = Bundle->NextInBundle)
13785 Res = Bundle->Inst;
13807 Res = FindLastInst();
13808 assert(Res &&
"Failed to find last instruction in bundle");
13812void BoUpSLP::setInsertPointAfterBundle(
const TreeEntry *E) {
13813 auto *Front = E->getMainOp();
13814 Instruction *LastInst = &getLastInstructionInBundle(E);
13815 assert(LastInst &&
"Failed to find last instruction in bundle");
13818 bool IsPHI = isa<PHINode>(LastInst);
13820 LastInstIt = LastInst->
getParent()->getFirstNonPHIIt();
13822 Builder.SetInsertPoint(LastInst->
getParent(), LastInstIt);
13826 Builder.SetInsertPoint(
13830 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
13833Value *BoUpSLP::gather(
13842 Loop *
L = LI->getLoopFor(Builder.GetInsertBlock());
13845 while (InsertBB && InsertBB != InstBB && Visited.
insert(InsertBB).second)
13846 InsertBB = InsertBB->getSinglePredecessor();
13847 return InsertBB && InsertBB == InstBB;
13849 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
13850 if (
auto *Inst = dyn_cast<Instruction>(VL[
I]))
13851 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
13852 getTreeEntry(Inst) ||
13853 (L && (!Root ||
L->isLoopInvariant(Root)) &&
L->contains(Inst))) &&
13854 PostponedIndices.
insert(
I).second)
13858 auto &&CreateInsertElement = [
this](
Value *Vec,
Value *
V,
unsigned Pos,
13861 if (
Scalar->getType() != Ty) {
13865 if (
auto *CI = dyn_cast<CastInst>(Scalar);
13866 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
13868 if (
auto *IOp = dyn_cast<Instruction>(
Op);
13869 !IOp || !(
isDeleted(IOp) || getTreeEntry(IOp)))
13872 Scalar = Builder.CreateIntCast(
13877 if (
auto *VecTy = dyn_cast<FixedVectorType>(
Scalar->getType())) {
13879 Vec = InsElt = Builder.CreateInsertVector(
13882 auto *
II = dyn_cast<IntrinsicInst>(InsElt);
13883 if (!
II ||
II->getIntrinsicID() != Intrinsic::vector_insert)
13886 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
13887 InsElt = dyn_cast<InsertElementInst>(Vec);
13891 GatherShuffleExtractSeq.
insert(InsElt);
13894 if (isa<Instruction>(V)) {
13895 if (TreeEntry *Entry = getTreeEntry(V)) {
13897 User *UserOp =
nullptr;
13899 if (
auto *SI = dyn_cast<Instruction>(Scalar))
13905 unsigned FoundLane =
Entry->findLaneForValue(V);
13906 ExternalUses.emplace_back(V, UserOp, FoundLane);
13916 std::iota(
Mask.begin(),
Mask.end(), 0);
13917 Value *OriginalRoot = Root;
13918 if (
auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
13919 SV && isa<PoisonValue>(SV->getOperand(1)) &&
13920 SV->getOperand(0)->getType() == VecTy) {
13921 Root = SV->getOperand(0);
13922 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
13925 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
13932 if (isa<PoisonValue>(VL[
I]))
13934 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
13938 if (isa<PoisonValue>(Vec)) {
13939 Vec = OriginalRoot;
13941 Vec = CreateShuffle(Root, Vec, Mask);
13942 if (
auto *OI = dyn_cast<Instruction>(OriginalRoot);
13943 OI && OI->hasNUses(0) &&
13944 none_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
13945 return TE->VectorizedValue == OI;
13951 for (
int I : NonConsts)
13952 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
13955 for (
const std::pair<Value *, unsigned> &Pair : PostponedInsts)
13956 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
13994 bool IsFinalized =
false;
  class ShuffleIRBuilder {
    IRBuilderBase &Builder;
    /// Holds all of the instructions that we gathered.
    SetVector<Instruction *> &GatherShuffleExtractSeq;
    /// A list of blocks that we are going to CSE.
    DenseSet<BasicBlock *> &CSEBlocks;
    /// Data layout.
    const DataLayout &DL;

  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates shufflevector for the 2 operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (V1->getType() != V2->getType()) {
          if (cast<VectorType>(V2->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            V2 = Builder.CreateIntCast(
                V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
          else
            V1 = Builder.CreateIntCast(
                V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
        }
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates shufflevector for the given operand and mask.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      unsigned VF = Mask.size();
      unsigned LocalVF =
          cast<FixedVectorType>(V1->getType())->getNumElements();
      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
        return V1;
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(getWidenedType(Ty, VF));
    }
    /// Resizes 2 input vectors to match their sizes, if they are not equal
    /// yet. The smallest vector is resized to the size of the larger one.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
  };

  /// Smart shuffle instruction emission, walks through shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder);
  }

  /// Transforms mask \p CommonMask per given \p Mask to make proper set after
  /// shuffle is inserted.
  void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                 ArrayRef<int> Mask) {
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx;
  }
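  /// Integer inputs are widened to the builder's scalar type before they take
  /// part in a shuffle, so both sources of a shufflevector always agree on
  /// the element type.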
  /// Cast value \p V to the vector of the builder's scalar type elements.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V,
        VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
        IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}

  /// Adjusts extractelements after reusing them.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      if (Mask[I] == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(VL[I]);
      VecBase = EI->getVectorOperand();
      if (const TreeEntry *TE = R.getTreeEntry(VecBase))
        VecBase = TE->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If the only one use is vectorized - can delete the extractelement
      // itself.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            const TreeEntry *UTE = R.getTreeEntry(U);
            return !UTE || R.MultiNodeScalars.contains(U) ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return any_of(TE->UserTreeIndices,
                                            [&](const EdgeInfo &Edge) {
                                              return Edge.UserTE == UTE;
                                            }) &&
                                     is_contained(VL, EI);
                            }) != 1;
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
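    // Multiple source vectors and multiple register parts: regroup the
    // extracts per part, shuffling at most two bases within each part and
    // folding the per-part results into a single wide vector.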
    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
      for (auto [I, Idx] : enumerate(Mask))
        if (Idx != PoisonMaskElem)
          Idx = I;
    };
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
      MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (const TreeEntry *TE = R.getTreeEntry(VecOp))
              VecOp = TE->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        if (I == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (const TreeEntry *TE = R.getTreeEntry(VecOp))
          VecOp = TE->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert((Part == 0 ||
                all_of(seq<unsigned>(0, Part),
                       [&](unsigned P) {
                         ArrayRef<int> SubMask = Mask.slice(
                             P * SliceSize,
                             getNumElems(Mask.size(), SliceSize, P));
                         return all_of(SubMask, [](int Idx) {
                           return Idx == PoisonMaskElem;
                         });
                       })) &&
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        unsigned NewVF =
            cast<FixedVectorType>(Vec->getType())->getNumElements();
        if (Vec->getType() != SubVec->getType()) {
          unsigned SubVecVF =
              cast<FixedVectorType>(SubVec->getType())->getNumElements();
          NewVF = std::max(NewVF, SubVecVF);
        }
        // Adjust SubMask.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += NewVF;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    copy(VecMask, Mask.begin());
    return Vec;
  }

  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all deps are ready.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the
    // process to keep correct order.
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        ResVecTy,
        PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
        MaybeAlign());
  }
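  /// The add() overloads below accept tree entries or plain vectors; entry
  /// operands are widened to the common scalar type first, deriving the
  /// signedness of the widening cast from the scalars whenever it cannot be
  /// proven non-negative.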
  /// Adds 2 input vectors (in form of tree entries) and the mask for their
  /// shuffling.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
                                if (isa<PoisonValue>(V))
                                  return false;
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    Value *V2 = E2.VectorizedValue;
    if (V2->getType()->isIntOrIntVectorTy())
      V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
                                if (isa<PoisonValue>(V))
                                  return false;
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    add(V1, V2, Mask);
  }
  /// Adds single input vector (in form of tree entry) and the mask for its
  /// shuffling.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
                                if (isa<PoisonValue>(V))
                                  return false;
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    add(V1, Mask);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               CommonMask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool = false) {
    assert(isa<FixedVectorType>(V1->getType()) &&
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        unsigned VF = std::max(CommonMask.size(), Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] =
                V->getType() != V1->getType()
                    ? Idx + VF
                    : Mask[Idx] + cast<FixedVectorType>(V1->getType())
                                      ->getNumElements();
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // Check if the second vector is required if the used elements are
      // already used from the first one.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
          InVectors.push_back(V1);
          break;
        }
    }
    int VF = getVF(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }

  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
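  /// Finalization applies the accumulated common mask (plus an optional
  /// external extend mask), inserts the combined subvectors and runs the
  /// optional action callback before emitting the final shuffle.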
  /// Finalize emission of the shuffles.
  Value *
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    SmallVector<int> NewExtMask(ExtMask.begin(), ExtMask.end());
    if (ScalarTyNumElements != 1) {
      assert(SLPReVec && "FixedVectorType is not expected.");
      transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewExtMask);
      ExtMask = NewExtMask;
    }
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask);
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = E->VectorizedValue;
          if (V->getType()->isIntOrIntVectorTy())
            V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
                                     if (isa<PoisonValue>(V))
                                       return false;
                                     return !isKnownNonNegative(
                                         V, SimplifyQuery(*R.DL));
                                   }));
          unsigned InsertionIndex = Idx * ScalarTyNumElements;
          const unsigned SubVecVF =
              cast<FixedVectorType>(V->getType())->getNumElements();
          if (InsertionIndex % SubVecVF == 0) {
            Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
                                             Builder.getInt64(InsertionIndex));
          } else {
            // Create shuffle, insertvector requires that index is multiple of
            // the subvector length.
            const unsigned VecVF =
                cast<FixedVectorType>(Vec->getType())->getNumElements();
            SmallVector<int> Mask(VecVF, PoisonMaskElem);
            std::iota(Mask.begin(), Mask.end(), 0);
            for (unsigned I : seq<unsigned>(
                     InsertionIndex, (Idx + SubVecVF) * ScalarTyNumElements))
              Mask[I] = I - InsertionIndex + VecVF;
            Vec = createShuffle(Vec, V, Mask);
          }
          if (!CommonMask.empty()) {
            std::iota(
                std::next(CommonMask.begin(), InsertionIndex),
                std::next(CommonMask.begin(),
                          (Idx + E->getVectorFactor()) * ScalarTyNumElements),
                InsertionIndex);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I1 != PoisonMaskElem)
            I1 = I2 + CommonMask.size();
        }
        Value *InsertVec =
            CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
        Vec = createShuffle(InsertVec, Vec, SVMask);
        for (unsigned I : seq<unsigned>(CommonMask.size())) {
          if (SVMask[I] != PoisonMaskElem)
            CommonMask[I] = I;
        }
      }
      InVectors.front() = Vec;
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
                                                         unsigned NodeIdx) {
  ArrayRef<Value *> VL = E->getOperand(NodeIdx);
  InstructionsState S = getSameOpcode(VL, *TLI);
  // Special processing for GEPs bundle, which may include non-gep values.
  if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    if (It != VL.end())
      S = getSameOpcode(*It, *TLI);
  }
  if (!S.getOpcode())
    return nullptr;
  auto CheckSameVE = [&](const TreeEntry *VE) {
    return VE->isSame(VL) &&
           (any_of(VE->UserTreeIndices,
                   [E, NodeIdx](const EdgeInfo &EI) {
                     return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
                   }) ||
            any_of(VectorizableTree,
                   [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
                     return TE->isOperandGatherNode(
                                {const_cast<TreeEntry *>(E), NodeIdx}) &&
                            VE->isSame(TE->Scalars);
                   }));
  };
  TreeEntry *VE = getTreeEntry(S.getMainOp());
  if (VE && CheckSameVE(VE))
    return VE;
  auto It = MultiNodeScalars.find(S.getMainOp());
  if (It != MultiNodeScalars.end()) {
    auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
      return TE != VE && CheckSameVE(TE);
    });
    if (I != It->getSecond().end())
      return *I;
  }
  return nullptr;
}
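/// Vectorizes a single operand of tree entry E: either forwards a matching
/// vectorized entry (reshuffled to the requested shape) or falls back to the
/// dedicated gather node built for this operand.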
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                 bool PostponedPHIs) {
  ValueList &VL = E->getOperand(NodeIdx);
  const unsigned VF = VL.size();
  if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
    auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
      // V may be affected by MinBWs, so take its element type, not ScalarTy.
      Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
      ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
      ShuffleBuilder.add(V, Mask);
      SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
          E->CombinedEntriesWithIndices.size());
      transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
                [&](const auto &P) {
                  return std::make_pair(VectorizableTree[P.first].get(),
                                        P.second);
                });
      assert((E->CombinedEntriesWithIndices.empty() ||
              E->ReorderIndices.empty()) &&
             "Expected either combined subnodes or reordering");
      return ShuffleBuilder.finalize({}, SubVectors, {});
    };
    Value *V = vectorizeTree(VE, PostponedPHIs);
    if (VF * getNumElements(VL[0]->getType()) !=
        cast<FixedVectorType>(V->getType())->getNumElements()) {
      if (!VE->ReuseShuffleIndices.empty()) {
        // Reshuffle to get only unique values.
        SmallVector<int> Mask(VF, PoisonMaskElem);
        for (auto [I, V] : enumerate(VL)) {
          if (isa<PoisonValue>(V))
            continue;
          Mask[I] = VE->findLaneForValue(V);
        }
        V = FinalShuffle(V, Mask);
      } else {
        assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
               "Expected vectorization factor less "
               "than original vector size.");
        SmallVector<int> UniformMask(VF, 0);
        std::iota(UniformMask.begin(), UniformMask.end(), 0);
        V = FinalShuffle(V, UniformMask);
      }
    }
    // Need to update the operand gather node, if actually the operand is not
    // a vectorized node, but the buildvector/gather node, which matches one
    // of the vectorized nodes.
    if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
          return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
        }) == VE->UserTreeIndices.end()) {
      auto *It =
          find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
                   TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
          });
      assert(It != VectorizableTree.end() && "Expected gather node operand.");
      (*It)->VectorizedValue = V;
    }
    return V;
  }

  // Find the corresponding gather entry and vectorize it.
  // Allows to be more accurate with tree/graph transformations, checks for
  // the correctness of the transformations in many cases.
  auto *I = find_if(VectorizableTree,
                    [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
                      return TE->isOperandGatherNode({E, NodeIdx});
                    });
  assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
  assert(I->get()->UserTreeIndices.size() == 1 &&
         "Expected only single user for the gather node.");
  assert(I->get()->isSame(VL) && "Expected same list of scalars.");
  return vectorizeTree(I->get(), PostponedPHIs);
}
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
                                       E->ReuseShuffleIndices.end());
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Clear values, to be replaced by insertvector instructions.
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    for_each(MutableArrayRef(GatheredScalars)
                 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
             [&](Value *&V) { V = PoisonValue::get(V->getType()); });
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  // Build a mask out of the reorder indices and reorder scalars per this
  // mask.
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  SmallVector<int> SubVectorsMask;
  inversePermutation(E->ReorderIndices, SubVectorsMask);
  // Transform non-clustered elements in the mask to poison (-1).
  // "Clustered" operations will be reordered using this mask later.
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (unsigned I : seq<unsigned>(GatheredScalars.size()))
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
  } else {
    SubVectorsMask.clear();
  }
  SmallVector<Value *> StoredGS(GatheredScalars);
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
    unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      auto *It =
          find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
            return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
                     return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
                   }) != TE->UserTreeIndices.end();
          });
      if (It == VectorizableTree.end())
        return false;
      SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
      if (!(*It)->ReorderIndices.empty()) {
        inversePermutation((*It)->ReorderIndices, ReorderMask);
        reorderScalars(GS, ReorderMask);
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
                   (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
                    is_contained(E->Scalars, V1));
          }))
        return false;
    }
    int Idx;
    if ((Mask.size() < InputVF &&
         ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
         Idx == 0) ||
        (Mask.size() == InputVF &&
         ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
      std::iota(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          0);
    } else {
      unsigned IVal =
          *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
      std::fill(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          IVal);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
      GatheredScalars.size() % NumParts != 0)
    NumParts = 1;
  if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
    // Check for gathered extracts.
    bool Resized = false;
    ExtractShuffles =
        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(ExtractMask)) {
        if (I == PoisonMaskElem)
          continue;
        if (const auto *TE = getTreeEntry(
                cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand()))
          ExtractEntries.push_back(TE);
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep correct order.
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            Resized = true;
            GatheredScalars.append(VF - GatheredScalars.size(),
                                   PoisonValue::get(OrigScalarTy));
          }
      }
    }
    // Gather extracts after we check for full matched gathers only.
    if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
        ((E->getOpcode() == Instruction::Load ||
          any_of(E->Scalars, IsaPred<LoadInst>)) &&
         any_of(E->Scalars,
                [this](Value *V) {
                  return isa<LoadInst>(V) && getTreeEntry(V);
                })) ||
        E->isAltShuffle() ||
        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
        isSplat(E->Scalars) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffles =
          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
    }
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep correct order.
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
          Entries.front().front()->isSame(E->Scalars)) {
        // Perfectly matched entry found, just use it.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          << shortBundleName(E->Scalars) << ".\n");
        // Restore the mask for previous partially matched values.
        Mask.resize(E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
        } else {
          for (auto [I, V] : enumerate(E->Scalars)) {
            if (isa<PoisonValue>(V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        ShuffleBuilder.add(*FrontTE, Mask);
        Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors,
                                      SubVectorsMask);
        return Res;
      }
      if (!Resized) {
        if (GatheredScalars.size() != VF &&
            any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
              return any_of(TEs, [&](const TreeEntry *TE) {
                return TE->getVectorFactor() == VF;
              });
            }))
          GatheredScalars.append(VF - GatheredScalars.size(),
                                 PoisonValue::get(OrigScalarTy));
      }
      // Remove shuffled elements from list of gathers.
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        if (Mask[I] != PoisonMaskElem)
          GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
      }
    }
  }
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
    Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-const values and all constant values.
    // For repeated values, just shuffle them.
    int NumNonConsts = 0;
    int SinglePos = 0;
    for (auto [I, V] : enumerate(Scalars)) {
      if (isa<UndefValue>(V)) {
        if (!isa<PoisonValue>(V)) {
          ReuseMask[I] = I;
          UndefPos.push_back(I);
        }
        continue;
      }
      if (isConstant(V)) {
        ReuseMask[I] = I;
        continue;
      }
      ++NumNonConsts;
      SinglePos = I;
      Value *OrigV = V;
      Scalars[I] = PoisonValue::get(OrigScalarTy);
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore single insert element.
      if (IsSplat) {
        ReuseMask.assign(VF, PoisonMaskElem);
        std::swap(Scalars.front(), Scalars[SinglePos]);
        if (!UndefPos.empty() && UndefPos.front() == 0)
          Scalars.front() = UndefValue::get(OrigScalarTy);
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with the simple broadcast.
      // We can do it if the broadcasted value is guaranteed to be
      // non-poisonous, or by freezing the incoming scalar value first.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
                (E->UserTreeIndices.size() == 1 &&
                 any_of(V->uses(), [E](const Use &U) {
                   // Check if the value already used in the same operation in
                   // one of the nodes already.
                   return E->UserTreeIndices.front().EdgeIdx !=
                              U.getOperandNo() &&
                          is_contained(
                              E->UserTreeIndices.front().UserTE->Scalars,
                              U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // Set the undef position to the non-poisoned scalar.
          ReuseMask[I] = Pos;
          // Replace the undef by the poison, in the mask it is replaced by
          // the non-poisoned scalar already.
          if (I != Pos)
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
      } else {
        // Replace undefs by the poisons, emit broadcast and then emit
        // freeze.
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          if (isa<UndefValue>(Scalars[I]))
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // Gather of extractelements can be represented as just a shuffle of
      // a single/two vectors the scalars are extracted from.
      // Find input vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
          ExtractMask[I] = PoisonMaskElem;
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          if (isa<UndefValue>(E->Scalars[I]))
            continue;
          auto *EI = cast<ExtractElementInst>(StoredGS[I]);
          Value *VecOp = EI->getVectorOperand();
          if (const auto *TE = getTreeEntry(VecOp))
            if (TE->VectorizedValue)
              VecOp = TE->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1) &&
                         isGuaranteedNotToBePoison(Vec2);
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1);
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
                           /*ForExtracts=*/true);
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
        auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
        VecMask.assign(VecMask.size(), PoisonMaskElem);
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue)
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
                isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
        }
      }
    }
    // Try to figure out the best way to combine values: build a shuffle and
    // insert elements, or just build several shuffles.
    // Insert non-constant scalars.
    SmallVector<Value *> NonConstants(GatheredScalars);
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if currently we
    // have a single permutation and more than 1 scalar constants.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(V) && !isa<PoisonValue>(V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(V) && !isa<UndefValue>(V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(V) && !isa<PoisonValue>(V);
         }) > 1);
    // NonConstants array contains just non-constant values, GatheredScalars
    // contains only constants to build the final vector and then shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(OrigScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
    // Generate constants for the final shuffle and build a mask for them.
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask,
          E->Scalars.size(), [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
            TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
            Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
          });
  } else if (!allConstant(GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
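/// Thin wrapper around processBuildVector: vectorizes the combined
/// sub-entries first, then runs the generic build-vector machinery with the
/// IR-emitting shuffle builder.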
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  bool PostponedPHIs) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder, *this);
}

/// \returns \p I after propagating metadata from \p VL only for instructions
/// in \p VL.
static Instruction *propagateMetadata(Instruction *Inst,
                                      ArrayRef<Value *> VL) {
  SmallVector<Value *> Insts;
  for (Value *V : VL)
    if (isa<Instruction>(V))
      Insts.push_back(V);
  return llvm::propagateMetadata(Inst, Insts);
}
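/// Per-entry code generation: emits the vector value for a tree entry and
/// memoizes it in TreeEntry::VectorizedValue, so diamond matches in the graph
/// are emitted only once.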
Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
  IRBuilderBase::InsertPointGuard Guard(Builder);

  if (E->VectorizedValue &&
      (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
       E->isAltShuffle())) {
    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  if (!isa<CmpInst>(V))
    ScalarTy = getValueType(V);
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    if (VecTy)
      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
  }
  auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
  if (E->isGather()) {
    // Set insert point for non-reduction initial nodes.
    if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
    E->VectorizedValue = Vec;
    return Vec;
  }

  bool IsReverseOrder =
      !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
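  // Dispatch on the (possibly alternate) opcode of the bundle. Each case
  // vectorizes its operands first, then emits the wide operation and runs
  // FinalShuffle to apply reuse/reorder masks.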
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() ||
            !E->UserTreeIndices.empty()) &&
           "PHI reordering is free.");
    if (PostponedPHIs && E->VectorizedValue)
      return E->VectorizedValue;
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    Builder.SetCurrentDebugLocation(PH->getDebugLoc());
    if (PostponedPHIs || !E->VectorizedValue) {
      PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
      E->PHI = NewPhi;
      Value *V = NewPhi;

      // Adjust insertion point once all PHI's have been generated.
      Builder.SetInsertPoint(PH->getParent(),
                             PH->getParent()->getFirstInsertionPt());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());

      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      if (PostponedPHIs)
        return V;
    }
    PHINode *NewPhi = cast<PHINode>(E->PHI);

    // PHINodes may have multiple entries from the same block. We want to
    // visit every block once.
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;

    for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
      BasicBlock *IBB = PH->getIncomingBlock(I);

      // Stop emission if all incoming values are generated.
      if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return NewPhi;
      }

      if (!VisitedBBs.insert(IBB).second) {
        NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
        continue;
      }

      Builder.SetInsertPoint(IBB->getTerminator());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
      Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      NewPhi->addIncoming(Vec, IBB);
    }

    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
           "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    if (const TreeEntry *TE = getTreeEntry(V))
      V = TE->VectorizedValue;
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    Value *NewV = ::propagateMetadata(V, E->Scalars);
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    ++NumVectorInstructions;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
    Value *V = vectorizeOperand(E, 1, PostponedPHIs);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
      assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
          V,
          getWidenedType(
              ScalarTy,
              cast<FixedVectorType>(V->getType())->getNumElements()),
          Res.second);
    }

    // Create InsertVector shuffle if necessary.
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();

    unsigned Offset = *getElementIndex(VL0);
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create shuffle to resize vector.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(NumElts - NumScalars, PoisonMaskElem);
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    // Create the InsertVector shuffle if necessary.
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      Value *Scalar = E->Scalars[PrevMask[I]];
      unsigned InsertIdx = *getElementIndex(Scalar);
      IsIdentity &= InsertIdx - Offset == I;
      Mask[InsertIdx - Offset] = I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous = !isConstant(V) && isGuaranteedNotToBePoison(V);
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // Follow all insert element instructions from the current buildvector
        // sequence.
        InsertElementInst *Ins = cast<InsertElementInst>(VL0);
        do {
          std::optional<unsigned> InsertIdx = getElementIndex(Ins);
          if (!InsertIdx)
            break;
          if (InsertMask[*InsertIdx] == PoisonMaskElem)
            InsertMask[*InsertIdx] = *InsertIdx;
          if (!Ins->hasOneUse())
            break;
          Ins = dyn_cast_or_null<InsertElementInst>(
              Ins->getUniqueUndroppableUser());
        } while (Ins);
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        SmallBitVector IsFirstUndef =
            isUndefVector(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          unsigned Idx = 0;
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              if (!V2)
                V2 = UndefValue::get(V->getType());
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              ++Idx;
            } else if (InsertMask[I] != PoisonMaskElem &&
                       Mask[I] == PoisonMaskElem) {
              InsertMask[I] = PoisonMaskElem;
            }
          }
        } else {
          InsertMask = Mask;
        }
      }
      if (!V2)
        V2 = PoisonValue::get(V->getType());
      V = Builder.CreateShuffleVector(V, V2, InsertMask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    for (unsigned I = 0; I < NumElts; I++) {
      if (Mask[I] != PoisonMaskElem)
        InsertMask[Offset + I] = I;
    }
    SmallBitVector UseMask =
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
    SmallBitVector IsFirstUndef =
        isUndefVector(FirstInsert->getOperand(0), UseMask);
    if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
        NumElts != NumScalars) {
      if (IsFirstUndef.all()) {
        if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
              if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
                InsertMask[I] = I + NumElts;
            }
          }
          V = Builder.CreateShuffleVector(
              V,
              IsFirstPoison.all() ? PoisonValue::get(V->getType())
                                  : FirstInsert->getOperand(0),
              InsertMask, cast<Instruction>(E->Scalars.back())->getName());
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      } else {
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        for (unsigned I = 0; I < NumElts; I++) {
          if (InsertMask[I] == PoisonMaskElem)
            InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
          else
            InsertMask[I] += NumElts;
        }
        V = Builder.CreateShuffleVector(
            FirstInsert->getOperand(0), V, InsertMask,
            cast<Instruction>(E->Scalars.back())->getName());
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
    }

    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
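  // All cast opcodes share one implementation: the operand is vectorized
  // once, and the cast opcode may be rewritten (e.g. into a trunc, ext or
  // bitcast) when minimum-bitwidth analysis changed the operand or result
  // element type.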
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    auto *CI = cast<CastInst>(VL0);
    Instruction::CastOps VecOpcode = CI->getOpcode();
    Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *R = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (L->getType() != R->getType()) {
      assert((getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }

    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    Value *V = Builder.CreateCmp(P0, L, R);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
      ICmp->setSameSign(/*B=*/false);
    // Do not cast for cmps.
    VecTy = cast<FixedVectorType>(V->getType());
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *True = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *False = vectorizeOperand(E, 2, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
              getOperandEntry(E, 2)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }

    unsigned CondNumElements = getNumElements(Cond->getType());
    unsigned TrueNumElements = getNumElements(True->getType());
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
    assert(getNumElements(False->getType()) == TrueNumElements &&
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      // When the return type is an i1 vector but the source is a fixed vector
      // type, we need to duplicate the condition value.
      Cond = Builder.CreateShuffleVector(
          Cond, createReplicatedMask(TrueNumElements / CondNumElements,
                                     CondNumElements));
    }
    assert(getNumElements(Cond->getType()) == TrueNumElements &&
           "Cannot vectorize Instruction::Select");
    Value *V = Builder.CreateSelect(Cond, True, False);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);

    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    Value *V = Builder.CreateUnOp(
        static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *I = dyn_cast<Instruction>(V))
      V = ::propagateMetadata(I, E->Scalars);

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);

    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    if (Op->getType() != VecTy) {
      assert((It != MinBWs.end() ||
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
        ArrayRef<Value *> Ops = E->getOperand(I);
        if (all_of(Ops, [&](Value *Op) {
              auto *CI = dyn_cast<ConstantInt>(Op);
              return CI && CI->getValue().countr_one() >= It->second.first;
            })) {
          // Redundant mask operation. Drop the whole operation and select the
          // other operand instead.
          V = FinalShuffle(I == 0 ? RHS : LHS, E);
          E->VectorizedValue = V;
          ++NumVectorInstructions;
          return V;
        }
      }
    }
    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (LHS->getType() != VecTy)
        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      if (RHS->getType() != VecTy)
        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }

    V = Builder.CreateBinOp(
        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
    propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = ::propagateMetadata(I, E->Scalars);
      // Drop nuw flags for abs(sub(commutative), true).
      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          any_of(E->Scalars, [](Value *V) {
            return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
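  // Loads use one of three strategies: a plain wide load for consecutive
  // pointers, an llvm.experimental.vp.strided.load when the pointers form a
  // constant or runtime stride, and a masked gather for arbitrary pointers.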
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::StridedVectorize) {
      Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
      Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
      PO = IsReverseOrder ? PtrN : Ptr0;
      std::optional<int> Diff = getPointersDiff(
          VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
      Type *StrideTy = DL->getIndexType(PO->getType());
      Value *StrideVal;
      if (Diff) {
        int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
        StrideVal =
            ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                           DL->getTypeAllocSize(ScalarTy));
      } else {
        SmallVector<Value *> PointerOps(E->Scalars.size());
        transform(E->Scalars, PointerOps.begin(), [](Value *V) {
          return cast<LoadInst>(V)->getPointerOperand();
        });
        std::optional<Value *> Stride =
            calculateRtStride(PointerOps, ScalarTy, *DL, *SE,
                              &*Builder.GetInsertPoint());
        Value *NewStride =
            Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
        StrideVal = Builder.CreateMul(
            NewStride,
            ConstantInt::get(StrideTy,
                             (IsReverseOrder ? -1 : 1) *
                                 static_cast<int>(
                                     DL->getTypeAllocSize(ScalarTy))));
      }
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {VecTy, PO->getType(), StrideTy},
          {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /*ArgNo=*/0,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      if (isa<FixedVectorType>(ScalarTy)) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        // When REVEC is enabled, we need to expand vector types into scalar
        // types.
        unsigned ScalarTyNumElements =
            cast<FixedVectorType>(ScalarTy)->getNumElements();
        unsigned VecTyNumElements =
            cast<FixedVectorType>(VecTy)->getNumElements();
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
        SmallVector<Constant *> Indices(VecTyNumElements);
        transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
          return Builder.getInt64(I % ScalarTyNumElements);
        });
        VecPtr = Builder.CreateGEP(
            VecTy->getElementType(),
            Builder.CreateShuffleVector(
                VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
            ConstantVector::get(Indices));
      }
      // Use the minimum alignment of the gathered loads.
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = ::propagateMetadata(NewLI, E->Scalars);

    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);

    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);

    Value *Ptr = SI->getPointerOperand();
    Instruction *ST;
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
        Ptr = SI->getPointerOperand();
      }
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          {VecValue, Ptr,
           ConstantInt::get(
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /*ArgNo=*/1,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      ST = Inst;
    }

    Value *V = ::propagateMetadata(ST, E->Scalars);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
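  // A GEP bundle is vectorized operand by operand; metadata is propagated
  // only from the getelementptr members, since the bundle may also contain
  // plain pointers.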
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = ::propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0, TTI);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
      TysForDecl.push_back(VecTy);
    auto *CEI = cast<CallInst>(VL0);
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      // Some intrinsics have scalar arguments. This argument should not be
      // vectorized.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
        ScalarArg = CEI->getArgOperand(I);
        // If the intrinsic is overloaded on the demoted argument type, cast
        // it to the smaller type.
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.CreateIntCast(
              ScalarArg, IntegerType::get(F->getContext(), It->second.first),
              /*isSigned=*/true);
        OpVecs.push_back(ScalarArg);
        if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
          TysForDecl.push_back(ScalarArg->getType());
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      ScalarArg = CEI->getArgOperand(I);
      if (cast<VectorType>(OpVec->getType())->getElementType() !=
              ScalarArg->getType()->getScalarType() &&
          It == MinBWs.end()) {
        auto *CastTy =
            getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      OpVecs.push_back(OpVec);
      if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
        TysForDecl.push_back(OpVec->getType());
    }

    Function *CF;
    if (!UseIntrinsic) {
      VFShape Shape =
          VFShape::get(CI->getFunctionType(),
                       ElementCount::getFixed(VecTy->getNumElements()),
                       /*HasGlobalPred=*/false);
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
    } else {
      CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

    propagateIRFlags(V, E->Scalars, VL0);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    Value *V;
    if (SLPReVec && !E->isAltShuffle()) {
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
      if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
        assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
               "Not supported shufflevector usage.");
        SmallVector<int> NewMask(ThisMask.size());
        transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
          return SVSrc->getShuffleMask()[Mask];
        });
        V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
      } else {
        V = Builder.CreateShuffleVector(Src, ThisMask);
      }
      propagateIRFlags(V, E->Scalars, VL0);
      if (auto *I = dyn_cast<Instruction>(V))
        V = ::propagateMetadata(I, E->Scalars);
      V = FinalShuffle(V, E);
    } else {
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");

      Value *LHS = nullptr, *RHS = nullptr;
      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        RHS = vectorizeOperand(E, 1, PostponedPHIs);
      } else {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0, PostponedPHIs);
      }
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      if (LHS && RHS &&
          ((Instruction::isBinaryOp(E->getOpcode()) &&
            (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
           (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
        if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
          if (cast<VectorType>(LHS->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            CastTy = RHS->getType();
          else
            CastTy = LHS->getType();
        }
        if (LHS->getType() != CastTy)
          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
        if (RHS->getType() != CastTy)
          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
      }

      Value *V0, *V1;
      if (Instruction::isBinaryOp(E->getOpcode())) {
        V0 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
        V1 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
        auto *AltCI = cast<CmpInst>(E->getAltOp());
        CmpInst::Predicate AltPred = AltCI->getPredicate();
        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
      } else {
        if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              cast<VectorType>(LHS->getType())->getElementType());
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
            assert(LHS->getType() == VecTy &&
                   "Expected same type as operand.");
            if (auto *I = dyn_cast<Instruction>(LHS))
              LHS = ::propagateMetadata(I, E->Scalars);
            LHS = FinalShuffle(LHS, E);
            E->VectorizedValue = LHS;
            ++NumVectorInstructions;
            return LHS;
          }
        }
        V0 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
        V1 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
      }
      // Add V0 and V1 to later analysis to try to find and remove matching
      // instruction, if any.
      for (Value *V : {V0, V1}) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      // Create shuffle to take alternate operations from the vector.
      // Also, gather up main and alt scalar ops to propagate IR flags to
      // each vector operation.
      ValueList OpScalars, AltScalars;
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [E, this](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask, &OpScalars, &AltScalars);

      propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
      propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
        // Drop nuw flags for abs(sub(commutative), true).
        if (auto *I = dyn_cast<Instruction>(Vec);
            I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            any_of(E->Scalars, [](Value *V) {
              if (isa<PoisonValue>(V))
                return false;
              auto *IV = cast<Instruction>(V);
              return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      };
      DropNuwFlag(V0, E->getOpcode());
      DropNuwFlag(V1, E->getAltOpcode());

      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
      }
      V = Builder.CreateShuffleVector(V0, V1, Mask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        V = ::propagateMetadata(I, E->Scalars);
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  default:
    llvm_unreachable("unknown inst");
  }
  return nullptr;
}
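// Driver for the whole tree: schedules all bundles, emits vectorized code for
// every entry (gathered loads and PHIs first), then resolves postponed
// gathers and rewires external users of the vectorized scalars.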
Value *BoUpSLP::vectorizeTree(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues,
    Instruction *ReductionRoot) {
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }
  // Clean the Entry-to-LastInstruction table. It can be affected after
  // scheduling, need to rebuild it.
  EntryToLastInstruction.clear();

  if (ReductionRoot)
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
  else
    Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());

  // Emit gathered loads first to emit better code for the users of those
  // gathered loads.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst &&
        (!TE->isGather() || !TE->UserTreeIndices.empty())) {
      assert((!TE->UserTreeIndices.empty() ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
    }
  }
  // Postpone emission of PHIs operands to avoid cyclic dependency issues.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
    if (TE->State == TreeEntry::Vectorize &&
        TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
        TE->VectorizedValue)
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
  // Run through the list of postponed gathers and emit them, replacing the
  // temporary stub instructions.
  DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
      if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
              TE->UserTreeIndices.front().EdgeIdx)) &&
          VecTE->isSame(TE->Scalars))
        // Found a gather node which is absolutely the same as one of the
        // vectorized nodes. It may happen after reordering.
        continue;
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI =
        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator, since the source vector that feeds this
    // gather node was inserted at the end of the block (after the stub).
    if (isa<PHINode>(UserI)) {
      Builder.SetInsertPoint(PrevVec->getParent()->getTerminator());
    } else {
      // Insert before all users.
      Instruction *InsertPt = PrevVec;
      for (User *U : PrevVec->users()) {
        if (U == UserI)
          continue;
        auto *UI = dyn_cast<Instruction>(U);
        if (!UI || isa<PHINode>(UI) ||
            UI->getParent() != InsertPt->getParent())
          continue;
        if (UI->comesBefore(InsertPt))
          InsertPt = UI;
      }
      Builder.SetInsertPoint(InsertPt);
    }
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
    Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        VecI && VecI->getParent() == Builder.GetInsertBlock() &&
        Builder.GetInsertPoint()->comesBefore(VecI))
      VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
                                 Builder.GetInsertPoint());
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        if (const TreeEntry *BaseTE = getTreeEntry(V)) {
          auto It = MinBWs.find(BaseTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            if (*IsSigned)
              break;
          }
          for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
            auto It = MinBWs.find(MNTE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          // Scan through gather nodes.
          for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
            auto It = MinBWs.find(BVE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
            IsSigned =
                IsSigned.value_or(false) ||
                !isKnownNonNegative(EE->getVectorOperand(),
                                    SimplifyQuery(*DL));
            continue;
          }
          if (IsSigned.value_or(false))
            break;
        }
      }
      if (IsSigned.value_or(false)) {
        // Final attempt - check the user node.
        auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(IsSigned &&
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node, if it was used before for one of the
    // buildvector nodes already.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    eraseInstruction(PrevVec);
  }
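  // Extract each externally used scalar from its vectorized value and rewire
  // its out-of-tree users; extracts are cached per scalar and per block so
  // the same lane is never extracted twice.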
  SmallDenseSet<Value *, 4> UsedInserts;
  DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
  SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
  SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
  SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
  // Extract all of the elements with the external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    llvm::User *User = ExternalUse.User;

    // Skip users that we already RAUW. This happens when one instruction
    // has multiple uses of the same value.
    if (User && !is_contained(Scalar->users(), User))
      continue;
    TreeEntry *E = getTreeEntry(Scalar);
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    // Non-instruction pointers are not deleted, just skip them.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        auto *Inst = dyn_cast<Instruction>(Scalar);
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts, just move the only one in the
          // current block.
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
              if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
                CI->moveAfter(I);
            }
            Ex = PrevV;
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // "Reuse" the existing extract to improve final codegen.
          if (ReplaceInst) {
            // Leave the instruction as is, if it cheaper extracts and all
            // operands are scalar.
            if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
              IgnoredExtracts.insert(EE);
              Ex = EE;
            } else {
              auto *CloneInst = Inst->clone();
              CloneInst->insertBefore(Inst);
              if (Inst->hasName())
                CloneInst->takeName(Inst);
              Ex = CloneInst;
            }
          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
                     ES && isa<Instruction>(Vec)) {
            Value *V = ES->getVectorOperand();
            auto *IVec = cast<Instruction>(Vec);
            if (const TreeEntry *ETE = getTreeEntry(V))
              V = ETE->VectorizedValue;
            if (auto *IV = dyn_cast<Instruction>(V);
                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            else
              Ex = Builder.CreateExtractElement(Vec, Lane);
          } else if (auto *VecTy =
                         dyn_cast<FixedVectorType>(Scalar->getType())) {
            assert(SLPReVec && "FixedVectorType is not expected.");
            unsigned VecTyNumElements = VecTy->getNumElements();
            // When REVEC is used, we need to extract a vector.
            Ex = Builder.CreateExtractVector(
                FixedVectorType::get(Vec->getType()->getScalarType(),
                                     VecTyNumElements),
                Vec, Builder.getInt64(ExternalUse.Lane * VecTyNumElements));
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          // If necessary, sign-extend or zero-extend ScalarRoot to the larger
          // type.
          ExV = Ex;
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(
                Ex, Scalar->getType(),
                !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
          auto *I = dyn_cast<Instruction>(Ex);
          ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
                                            : &F->getEntryBlock(),
                                          std::make_pair(Ex, ExV));
        }
        if (auto *ExI = dyn_cast<Instruction>(Ex);
            ExI && !isa<PHINode>(ExI)) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        return ExV;
      }
      assert(isa<FixedVectorType>(Scalar->getType()) &&
             isa<InsertElementInst>(Scalar) &&
             "In-tree scalar of vector type is not insertelement?");
      auto *IE = cast<InsertElementInst>(Scalar);
      VectorToInsertElement.try_emplace(Vec, IE);
      return Vec;
    };
    // If User == nullptr, the Scalar remains as scalar in vectorized
    // instructions or is used as an extra argument. Generate an
    // ExtractElement instruction and update the record for this scalar in
    // ExternallyUsedValues.
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert(
          (ExternallyUsedValues.count(Scalar) ||
           ExternalUsesAsOriginalScalar.contains(Scalar) ||
           any_of(
               Scalar->users(),
               [&](llvm::User *U) {
                 if (ExternalUsesAsOriginalScalar.contains(U))
                   return true;
                 TreeEntry *UseEntry = getTreeEntry(U);
                 return UseEntry &&
                        (UseEntry->State == TreeEntry::Vectorize ||
                         UseEntry->State ==
                             TreeEntry::StridedVectorize) &&
                        (E->State == TreeEntry::Vectorize ||
                         E->State == TreeEntry::StridedVectorize) &&
                        doesInTreeUserNeedToExtract(
                            Scalar, getRootEntryInstruction(*UseEntry),
                            TLI, TTI);
               })) &&
          "Scalar with nullptr User must be registered in "
          "ExternallyUsedValues map or remain as scalar in vectorized "
          "instructions");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI)) {
          if (PHI->getParent()->isLandingPad())
            Builder.SetInsertPoint(
                PHI->getParent(),
                std::next(
                    PHI->getParent()->getLandingPadInst()->getIterator()));
          else
            Builder.SetInsertPoint(PHI->getParent(),
                                   PHI->getParent()->getFirstNonPHIIt());
        } else {
          Builder.SetInsertPoint(VecI->getParent(),
                                 std::next(VecI->getIterator()));
        }
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock(),
                               F->getEntryBlock().begin());
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // Required to update internally referenced instructions.
      if (Scalar != NewInst) {
        assert((!isa<ExtractElementInst>(Scalar) ||
                !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
               "Extractelements should not be replaced.");
        Scalar->replaceAllUsesWith(NewInst);
      }
      continue;
    }

    if (auto *VU = dyn_cast<InsertElementInst>(User);
        VU && VU->getOperand(1) == Scalar) {
      // Skip if the scalar is another vector op or Vec is not an instruction.
      if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
        if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
          if (!UsedInserts.insert(VU).second)
            continue;
          // Need to use the original vector, if the root is truncated.
          auto BWIt = MinBWs.find(E);
          if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
            auto *ScalarTy = FTy->getElementType();
            auto Key = std::make_pair(Vec, ScalarTy);
            auto VecIt = VectorCasts.find(Key);
            if (VecIt == VectorCasts.end()) {
              IRBuilderBase::InsertPointGuard Guard(Builder);
              if (auto *IVec = dyn_cast<PHINode>(Vec)) {
                if (IVec->getParent()->isLandingPad())
                  Builder.SetInsertPoint(IVec->getParent(),
                                         std::next(IVec->getParent()
                                                       ->getLandingPadInst()
                                                       ->getIterator()));
                else
                  Builder.SetInsertPoint(
                      IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
              } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
                Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
              }
              Vec = Builder.CreateIntCast(
                  Vec,
                  getWidenedType(
                      ScalarTy,
                      cast<FixedVectorType>(Vec->getType())
                          ->getNumElements()),
                  BWIt->second.second);
              VectorCasts.try_emplace(Key, Vec);
            } else {
              Vec = VecIt->second;
            }
          }

          std::optional<unsigned> InsertIdx = getElementIndex(VU);
          if (InsertIdx) {
            auto *It = find_if(
                ShuffledInserts,
                [VU](const ShuffledInsertData<Value *> &Data) {
                  // Checks if 2 insertelements are from the same buildvector.
                  InsertElementInst *VecInsert = Data.InsertElements.front();
                  return areTwoInsertFromSameBuildVector(
                      VU, VecInsert,
                      [](InsertElementInst *II) { return II->getOperand(0); });
                });
            unsigned Idx = *InsertIdx;
            if (It == ShuffledInserts.end()) {
              (void)ShuffledInserts.emplace_back();
              It = std::next(ShuffledInserts.begin(),
                             ShuffledInserts.size() - 1);
            }
            SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
            if (Mask.empty())
              Mask.assign(FTy->getNumElements(), PoisonMaskElem);
            Mask[Idx] = ExternalUse.Lane;
            It->InsertElements.push_back(cast<InsertElementInst>(User));
            continue;
          }
        }
      }
    }

    // Generate extracts for out-of-tree users.
    // Find the insertion point for the extractelement lane.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (PHINode *PH = dyn_cast<PHINode>(User)) {
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              Builder.SetInsertPoint(
                  PH->getIncomingBlock(I)->getTerminator());
            }
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        Builder.SetInsertPoint(cast<Instruction>(User));
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        User->replaceUsesOfWith(Scalar, NewInst);
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      User->replaceUsesOfWith(Scalar, NewInst);
    }

    LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
  }
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };

  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
    if (VF != VecVF) {
      if (any_of(Mask,
                 [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
        Vec = CreateShuffle(Vec, nullptr, Mask);
        return std::make_pair(Vec, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        Vec = CreateShuffle(Vec, nullptr, ResizeMask);
      }
    }

    return std::make_pair(Vec, false);
  };
  // Perform shuffling of the vectorize tree entries for better handling of
  // external extracts.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // Find the first and the last instruction in the list of insertelements.
    sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    Value *NewInst = performExtractsShuffleAction<Value>(
        MutableArrayRef(Vector.data(), Vector.size()),
        FirstInsert->getOperand(0),
        [](Value *Vec) {
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
        },
        ResizeToVF,
        [&](ArrayRef<int> Mask, ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create shuffle if the mask is a simple identity
            // non-resizing mask.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : FirstInsert->getOperand(0),
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild buildvector chain.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    SmallVector<Instruction *> Inserts;
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      if (*It == II)
        ++It;
      else
        Inserts.push_back(cast<Instruction>(II));
      II = dyn_cast<InsertElementInst>(II->getOperand(0));
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
      NewInst = II;
    }
    LastInsert->replaceAllUsesWith(NewInst);
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            PoisonValue::get(IE->getOperand(0)->getType()));
      IE->replaceUsesOfWith(IE->getOperand(1),
                            PoisonValue::get(IE->getOperand(1)->getType()));
      eraseInstruction(IE);
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
  // For each vectorized value:
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather())
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
        continue;
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
        continue;
      if (isa<PoisonValue>(Scalar))
        continue;
#ifndef NDEBUG
      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
        for (User *U : Scalar->users()) {
          // It is legal to delete users in the ignorelist.
          assert((getTreeEntry(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
        }
      }
#endif
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      auto *I = cast<Instruction>(Scalar);
      RemovedInsts.push_back(I);
    }
  }

  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);
  // Clear up reduction references, if any.
  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntry(I);
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() &&
            !IE->UserTreeIndices.empty() &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             any_of(IE->UserTreeIndices,
                    [&](const EdgeInfo &EI) {
                      return EI.UserTE == VectorizableTree.front().get() &&
                             EI.EdgeIdx == UINT_MAX;
                    }))) &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() &&
            is_contained(VectorizableTree.front()->Scalars, I)))
        continue;
      SmallVector<SelectInst *> LogicalOpSelects;
      I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
        // Do not replace condition of the logical op in form select <cond>.
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          return false;
        }
        return UserIgnoreList->contains(U.getUser());
      });
      // Replace conditions of the poisoning logical ops with a non-poison
      // constant value.
      for (SelectInst *SI : LogicalOpSelects)
        SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
    }
  }
  // Retain to-be-deleted instructions for some debug-info bookkeeping and
  // deletion - instructions are not deleted until later.
  removeInstructionsAndOperands(ArrayRef(RemovedInsts));
  InstrElementSize.clear();

  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  // If the reduction was demoted to a narrower bit width, cast the vectorized
  // root back to the expected reduction width.
  if (auto It = MinBWs.find(&RootTE);
      ReductionBitWidth != 0 && It != MinBWs.end() &&
      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                        cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
  }
  return Vec;
}

void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences: hoist loop-invariant gather sequences
  // out of loops into the preheader.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;
    // We cannot hoist the instruction if any of its operands is defined
    // inside the loop.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;
    I->moveBefore(PreHeader->getTerminator());
    CSEBlocks.insert(PreHeader);
  }

  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });
  // Perform an O(N^2) scan over the gather/shuffle sequences and merge
  // identical instructions.
  auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
                                           SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the last undefs actually change the final number of used
    // vector registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(FixedVectorType::get(
               SI1->getType()->getElementType(), SM1.size())) ==
               TTI->getNumberOfParts(
                   FixedVectorType::get(SI1->getType()->getElementType(),
                                        SM1.size() - LastUndefsCnt));
  };
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
      if (isDeleted(&In))
        continue;
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the previously
      // visited instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          eraseInstruction(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          In.moveAfter(V);
          V->replaceAllUsesWith(&In);
          eraseInstruction(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced)
        Visited.push_back(&In);
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
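// --- Scheduling ---
// The functions below implement the block-local scheduler. Each group of
// scalars chosen for one vector instruction is modelled as a "bundle": a
// singly linked list of ScheduleData nodes in which every member's
// FirstInBundle points at the head and NextInBundle chains the members.
// Illustrative layout for a bundle built from {S0, S1, S2} (a sketch of the
// invariant buildBundle() establishes, not code from this file):
//   S0: FirstInBundle = S0, NextInBundle = S1
//   S1: FirstInBundle = S0, NextInBundle = S2
//   S2: FirstInBundle = S0, NextInBundle = nullptr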
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
  ScheduleData *Bundle = nullptr;
  ScheduleData *PrevInBundle = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member "
           "(maybe not in same basic block)");
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle)
      PrevInBundle->NextInBundle = BundleMember;
    else
      Bundle = BundleMember;

    // Group the instructions to a bundle.
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  assert(Bundle && "Failed to find schedule bundle");
  return Bundle;
}
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  // No need to schedule PHIs, insertelement and extractelement instructions.
  if (isa<PHINode>(S.getMainOp()) ||
      isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
    return nullptr;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.getMainOp() << "\n");

  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
                                                           ScheduleData *Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to recalculate
    // all dependencies.
    if (ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
        if (ScheduleData *SD = getScheduleData(I))
          SD->clearDependencies();
      ReSchedule = true;
    }
    if (Bundle) {
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
    }

    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" there are no cyclic
    // dependencies and we can schedule it. Note that it is important that we
    // don't "schedule" the bundle yet.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleData *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
             "must be ready to schedule");
      schedule(Picked, ReadyInsts);
    }
  };

  // Make sure that the scheduling region contains all instructions of the
  // bundle.
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the region could not be extended, still recalculate dependencies;
      // otherwise the compiler may crash trying to incorrectly calculate
      // dependencies and emit instructions in the wrong order at the actual
      // scheduling.
      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");

    // Make sure we don't leave the pieces of the bundle in the ready list when
    // the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);

    if (!BundleMember->IsScheduled)
      continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
    LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  auto *Bundle = buildBundle(VL);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle->isReady()) {
    cancelScheduling(VL, S.getMainOp());
    return std::nullopt;
  }
  return Bundle;
}
  ScheduleData *Bundle = getScheduleData(OpValue);
  LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() &&
         "tried to unbundle something which is not a bundle");

  // Remove the bundle from the ready list.
  if (Bundle->isReady())
    ReadyInsts.remove(Bundle);

  // Un-bundle: make single instructions out of the bundle.
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->TE = nullptr;
    if (BundleMember->unscheduledDepsInBundle() == 0)
      ReadyInsts.insert(BundleMember);
    BundleMember = Next;
  }
}
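// ScheduleData nodes are allocated in fixed-size chunks (arrays owned by
// unique_ptr) rather than individually. Pointers handed out stay stable
// because existing chunks are never reallocated; a new chunk is simply
// appended once the current one is full.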
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
         !doesNotNeedToBeScheduled(I) &&
         "phi nodes/insertelements/extractelements/extractvalues don't need to "
         "be scheduled");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region. Ignore
  // debug info and other "assume-like" intrinsics so they don't count against
  // the budget.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
      return false;
    }

    ++UpIter;
    ++DownIter;

    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
    if (doesNotNeedToBeScheduled(I))
      continue;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore)
        CurrentLoadStore->NextLoadStore = SD;
      else
        FirstLoadStoreInRegion = SD;
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
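// calculateDependencies() records three kinds of edges per bundle member:
// def-use edges (users inside the region), control dependencies (code that
// must not be reordered across non-speculatable instructions or
// stacksave/stackrestore pairs), and memory dependencies (may-alias
// loads/stores found by walking the NextLoadStore list). Every edge bumps
// Dependencies and, while the destination bundle is unscheduled, the
// unscheduled-deps counter; a bundle becomes "ready" once that counter
// reaches zero.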
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  SmallVector<ScheduleData *, 10> WorkList;
  WorkList.push_back(SD);

  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.pop_back_val();
    for (ScheduleData *BundleMember = SD; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      assert(isInSchedulingRegion(BundleMember));
      if (BundleMember->hasValidDependencies())
        continue;

      BundleMember->Dependencies = 0;
      BundleMember->resetUnscheduledDeps();

      // Handle def-use chain dependencies.
      for (User *U : BundleMember->Inst->users()) {
        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
      }

      auto MakeControlDependent = [&](Instruction *I) {
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->ControlDependencies.push_back(BundleMember);
        BundleMember->Dependencies++;
        ScheduleData *DestBundle = DepDest->FirstInBundle;
        if (!DestBundle->IsScheduled)
          BundleMember->incrementUnscheduledDeps(1);
        if (!DestBundle->hasValidDependencies())
          WorkList.push_back(DestBundle);
      };

      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
        for (Instruction *I = BundleMember->Inst->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (isSafeToSpeculativelyExecute(I))
            continue;
          // Add the dependency.
          MakeControlDependent(I);
          if (!isGuaranteedToTransferExecutionToSuccessor(I))
            // Everything past here must be control dependent on I.
            break;
        }
      }

      if (RegionHasStackSave) {
        // If we have an inalloca alloca instruction, it needs to be scheduled
        // after any preceding stacksave; the same holds for stackrestore.
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst,
                  m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              // Any allocas past here must be control dependent on I.
              break;
            if (!isa<AllocaInst>(I))
              continue;
            // Add the dependency.
            MakeControlDependent(I);
          }
        }

        // In addition, prevent allocas and loads/stores from moving below a
        // stacksave or above a stackrestore.
        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              continue;
            // Add the dependency.
            MakeControlDependent(I);
            break;
          }
        }
      }

      // Handle the memory dependencies (if any).
      ScheduleData *DepDest = BundleMember->NextLoadStore;
      if (!DepDest)
        continue;
      Instruction *SrcInst = BundleMember->Inst;
      assert(SrcInst->mayReadOrWriteMemory() &&
             "NextLoadStore list for non memory effecting bundle?");
      MemoryLocation SrcLoc = getLocation(SrcInst);
      bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
      unsigned NumAliased = 0;
      unsigned DistToSrc = 1;

      for (; DepDest; DepDest = DepDest->NextLoadStore) {
        assert(isInSchedulingRegion(DepDest));

        // Two limits keep the complexity bounded: AliasedCheckLimit caps the
        // number of expensive SLP->isAliased queries per instruction, and
        // MaxMemDepDistance aborts the scan in very large blocks.
        if (DistToSrc >= MaxMemDepDistance ||
            ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
             (NumAliased >= AliasedCheckLimit ||
              SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
          // We increment the counter only if the locations are aliased
          // (instead of counting all alias checks), giving a better balance
          // between reduced runtime and accurate dependencies.
          NumAliased++;

          DepDest->MemoryDependencies.push_back(BundleMember);
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = DepDest->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
        DistToSrc++;
      }
    }
  }
  if (InsertInReadyList && SD->isReady()) {
    ReadyInsts.insert(SD);
    LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst
                      << "\n");
  }
}
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = getScheduleData(I)) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->IsScheduled = false;
      SD->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
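// scheduleBlock() performs the final list scheduling: bundles are prioritized
// by SchedulingPriority (their original position in the block), popped from a
// std::set-based ready list, and their instructions are moved so each bundle
// ends up contiguous without violating any recorded dependency.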
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated and fill the ready-list with
  // initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] TreeEntry *SDTE = getTreeEntry(SD->Inst);
      assert((isVectorLikeInstWithConstOps(SD->Inst) ||
              SD->isPartOfBundle() ==
                  (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;

      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      Instruction *PickedInst = BundleMember->Inst;
      if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
        PickedInst->moveAfter(LastScheduledInst->getPrevNode());
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
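// getVectorElementSize() estimates the element width to use for
// vectorization: stores simply use the width of the stored value; for other
// roots the expression tree is walked bottom-up looking for loads/extracts,
// whose width usually reflects the underlying memory operations better than
// the root's own type.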
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value (or value
  // truncated just before storing) without traversing the expression tree.
  // This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type.
  SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V)) {
    Worklist.emplace_back(I, I->getParent());
    Visited.insert(I);
  }

  // Traverse the expression tree in bottom-up order looking for loads.
  unsigned Width = 0;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;

    // If the current instruction is a load/extract, update Width to reflect
    // the width of the loaded value.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));

    // Otherwise, visit the operands of the instruction. If an operand is an
    // instruction we haven't yet visited and from the same basic block as the
    // user, or the user is a PHI node, add it to the worklist.
    for (Use &U : I->operands()) {
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (Visited.insert(J).second &&
            (isa<PHINode>(I) || J->getParent() == Parent)) {
          Worklist.emplace_back(J, J->getParent());
          continue;
        }
      if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
        FirstNonBool = U.get();
    }
  }

  // If we didn't encounter a memory access in the expression tree, just
  // return the width of V. Otherwise, return the maximum width we found.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  if (auto *I = dyn_cast<Instruction>(V))
    InstrElementSize[I] = Width;

  return Width;
}
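// Minimum-bitwidth analysis. collectValuesToDemote() decides, per tree entry,
// whether the computation can be carried out in a narrower integer type.
// Illustrative example (not from this source):
//   %a = zext i8 %x to i32
//   %b = zext i8 %y to i32
//   %c = add i32 %a, %b
//   %t = trunc i32 %c to i16
// The add needs at most 16 bits here, so the subtree can be demoted to i16
// and the surrounding zext/trunc pair shrunk to match.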
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  if (OrigBitWidth == BitWidth) {
    MaxDepthLevel = 1;
    return true;
  }

  // Check if the node was analyzed already and must keep its original
  // bitwidth.
  if (NodesToKeepBWs.contains(E.Idx))
    return false;

  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(V))
      return true;
    unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
    bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
      if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
        return true;
    }
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    if (IsSignedVal)
      ++BitWidth1;
    if (auto *I = dyn_cast<Instruction>(V)) {
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
        if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
          break;
        BitWidth2 *= 2;
      }
      BitWidth1 = std::min(BitWidth1, BitWidth2);
    }
    BitWidth = std::max(BitWidth, BitWidth1);
    return BitWidth > 0 && OrigBitWidth / BitWidth > 1;
  };
  using namespace std::placeholders;
  auto FinalAnalysis = [&]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      // Check possible extractelement instructions bases and final vector
      // length.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
              TTI->getNumberOfParts(getWidenedType(
                  IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !getTreeEntry(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !all_of(V->users(), [=](User *U) {
          return getTreeEntry(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          BitWidth = OrigBitWidth;
          return false;
        }
        MaxDepthLevel = 1;
        BitWidth = BestFailBitwidth;
        NeedToExit = true;
        return true;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                              std::ref(BitWidth)));
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (E.UserTreeIndices.size() > 1 &&
              !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                           std::ref(BitWidth))))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };
  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and it is a shift of an
    // in-range amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // We can truncate a lshr to a smaller lshr iff we know that the bits we
    // would otherwise be shifting in are already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // We can truncate an ashr to a smaller ashr iff the truncated bits are
    // all sign bits of the original value.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                                nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands. Note
  // that we don't need to worry about cycles since we ensure single use
  // above.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin &&
        ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      unsigned MinBW = PowerOf2Ceil(BitWidth);
      SmallVector<Type *> ArgTys =
          buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
      auto VecCallCosts = getVectorCallCosts(
          IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
          TTI, TLI, ArgTys);
      InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up demoting.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
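// computeMinimumValueSizes() drives the demotion analysis over the whole
// graph: it derives a per-root MaxBitWidth from known sign bits and demanded
// bits, walks Trunc-rooted and compare-rooted subtrees separately (signed
// compares require an extra sign bit), and records the chosen width per
// entry in MinBWs so codegen emits narrow operations plus a single extend at
// the root.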
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->getOpcode() == Instruction::Store ||
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  if (VectorizableTree[NodeIdx]->isGather() ||
      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
      (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                              [NodeIdx](const EdgeInfo &EI) {
                                return EI.UserTE->Idx > NodeIdx;
                              })))
    return;

  // The first value node for store/insertelement is sext/zext/trunc? Skip it,
  // analyze its operand instead.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  SmallDenseSet<unsigned, 8> NodesToKeepBWs;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
                                bool IsProfitableToDemoteRoot, unsigned Opcode,
                                unsigned Limit, bool IsTruncRoot,
                                bool IsSignedCmp) -> unsigned {
    // Check if the root is trunc and the next node is gather/buildvector, then
    // keep trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 none_of(V->users(), [&](User *U) {
                   const TreeEntry *TE = getTreeEntry(U);
                   const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
                   if (TE == UserTE || !TE)
                     return false;
                   if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                            SelectInst>(U) ||
                       !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                            SelectInst>(UserTE->getMainOp()))
                     return true;
                   unsigned UserTESz = DL->getTypeSizeInBits(
                       UserTE->Scalars.front()->getType());
                   auto It = MinBWs.find(TE);
                   if (It != MinBWs.end() && It->second.first > UserTESz)
                     return true;
                   return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                 });
        })) {
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT || !Opcode)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    unsigned NumParts = TTI->getNumberOfParts(getWidenedType(TreeRootIT, VF));

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type,
    // rather than sign-extended. This happens when the sign bit of every root
    // is known to be zero.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    // Check if all the bits of the roots are demanded. If not, we can
    // truncate the roots to a narrower type.
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
        continue;
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
      TypeSize NumTypeBits =
          DL->getTypeSizeInBits(Root->getType()->getScalarType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, we must add one to the
      // maximum bit width to account for the unknown sign bit. This preserves
      // the existing sign bit so we can safely sign-extend the root back to
      // the original type.
      if (!IsKnownPositive)
        ++BitWidth1;

      APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // If the original type is large but the reduced type does not improve
    // register use, ignore it.
    if (NumParts > 1 &&
        NumParts ==
            TTI->getNumberOfParts(getWidenedType(
                IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
      return 0u;

    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the values that can be demoted in ToDemote.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);

    return MaxBitWidth;
  };

  // Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
    if (all_of(*UserIgnoreList,
               [](Value *V) {
                 return isa<PoisonValue>(V) ||
                        cast<Instruction>(V)->getOpcode() == Instruction::Add;
               }) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
          continue;
        unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
          ++BitWidth1;
        unsigned BitWidth2 = BitWidth1;
        if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
          APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        }
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;

      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
        Limit, IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [](const EdgeInfo &EI) {
                   return EI.EdgeIdx == 0 &&
                          EI.UserTE->getOpcode() == Instruction::Trunc &&
                          !EI.UserTE->isAltShuffle();
                 });
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [&](const EdgeInfo &EI) {
                   return EI.UserTE->getOpcode() == Instruction::ICmp &&
                          any_of(EI.UserTE->Scalars, [&](Value *V) {
                            auto *IC = dyn_cast<ICmpInst>(V);
                            return IC &&
                                   (IC->isSigned() ||
                                    !isKnownNonNegative(IC->getOperand(0),
                                                        SimplifyQuery(*DL)) ||
                                    !isKnownNonNegative(IC->getOperand(1),
                                                        SimplifyQuery(*DL)));
                          });
                 });
    }

    // If the maximum bit width we compute is less than the width of the
    // roots' type, we can proceed with the narrowing. Otherwise, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
                ->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
      NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
  if (!RunSLPVectorization)
    return false;
  SE = SE_;
  TTI = TTI_;
  TLI = TLI_;
  AA = AA_;
  LI = LI_;
  DT = DT_;
  AC = AC_;
  DB = DB_;
  DL = &F.getDataLayout();

  Stores.clear();
  GEPs.clear();
  bool Changed = false;

  // If the target claims to have no vector registers don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom up slp vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    if (!DT->isReachableFromEntry(BB))
      continue;

    // Start a new block - clear the list of reduction roots.
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
    LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
  }
  return Changed;
}
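// vectorizeStoreChain() attempts to vectorize one consecutive run of stores:
// it builds the SLP tree for the chain, applies reordering, minimum-bitwidth
// analysis and cost modelling, and reports back a canonical tree size that
// the caller uses to prune other vectorization-factor attempts over the same
// stores.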
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  Size = 0;
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!has_single_bit(Sz) ||
      !hasFullVectorsOrPowerOf2(
          *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
          VF) ||
      VF < 2 || VF < MinVF) {
    // Check if vectorizing with a non-power-of-2 VF should be considered. At
    // the moment, only consider cases where VF + 1 is a power-of-2, i.e.
    // almost all vector lanes are used.
    if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
      return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsAllowedSize =
        hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                                 ValOps.size()) ||
        (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
    if ((!IsAllowedSize && S.getOpcode() &&
         S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(),
                 [&](Value *V) {
                   return !isa<ExtractElementInst>(V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(V->users(), [&](User *U) {
                             return !Stores.contains(U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
      Size = (!IsAllowedSize && S.getOpcode()) ? 1 : 2;
      return false;
    }
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if the tree is tiny and the store itself or its value is not
  // vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
                    << "\n");
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}
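// Helper used by vectorizeStores() to decide whether the per-store tree sizes
// in a candidate slice are (nearly) uniform. The function below computes the
// mean and the variance of the recorded sizes; with integer division, the
// final test
//   Dev * 81 / (Mean * Mean) == 0
// is equivalent to Dev < Mean^2 / 81, i.e. the standard deviation stays below
// one ninth of the mean.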
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  unsigned Num = 0;
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        if (Size == 1)
          return V;
        ++Num;
        return V + Size;
      });
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       if (P == 1)
                         return V;
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 81 / (Mean * Mean) == 0;
}
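// vectorizeStores() groups the collected stores by pointer and constant
// distance, then repeatedly walks each group trying candidate vectorization
// factors over consecutive slices. RangeSizes tracks, per store, the best
// tree size seen so far for smaller and for register-sized-and-larger VFs, so
// later attempts can skip slices that already proved unprofitable.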
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store
  // twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  struct StoreDistCompare {
    bool operator()(const std::pair<unsigned, int> &Op1,
                    const std::pair<unsigned, int> &Op2) const {
      return Op1.second < Op2.second;
    }
  };
  // A set of pairs (index of store in Stores array ref, Distance of the store
  // address relative to the base store address in units).
  using StoreIndexToDistSet =
      std::set<std::pair<unsigned, int>, StoreDistCompare>;
  auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
    int PrevDist = -1;
    BoUpSLP::ValueList Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(Set)) {
      if (Operands.empty() || Data.second - PrevDist == 1) {
        Operands.push_back(Stores[Data.first]);
        PrevDist = Data.second;
        if (Idx != Set.size() - 1)
          continue;
      }
      auto E = make_scope_exit([&, &DataVar = Data]() {
        Operands.clear();
        Operands.push_back(Stores[DataVar.first]);
        PrevDist = DataVar.second;
      });

      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      unsigned MinVF = std::max<unsigned>(
          2, PowerOf2Ceil(TTI->getStoreMinimumVF(
                 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
                 ValueTy)));

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // lanes are used.
        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
        if (has_single_bit(CandVF + 1)) {
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        }
      }

      unsigned MaxRegVF = MaxVF;
      MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
      SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
      unsigned Size = MinVF;
      for_each(reverse(CandidateVFs), [&](unsigned &VF) {
        VF = Size > MaxVF ? NonPowerOf2VF : Size;
        Size *= 2;
      });
      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
        P.first = P.second = 1;
      });
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned Size : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned StartIdx = std::distance(
              RangeSizes.begin(),
              find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
                                            std::placeholders::_1)));
          while (StartIdx < End) {
            unsigned EndIdx =
                std::distance(RangeSizes.begin(),
                              find_if(RangeSizes.drop_front(StartIdx),
                                      std::bind(IsVectorized, Size >= MaxRegVF,
                                                std::placeholders::_1)));
            unsigned Sz = EndIdx >= End ? End : EndIdx;
            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
              if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
                                  Size >= MaxRegVF)) {
                ++Cnt;
                continue;
              }
              ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
                  Cnt += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
              if (!Res) {
                NonSchedulable
                    .try_emplace(Slice.front(), std::make_pair(Size, Size))
                    .first->getSecond()
                    .second = Size;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert(Slice.begin(), Slice.end());
                AnyProfitableGraph = RepeatChanged = Changed = true;
                // If we vectorized the initial block, no need to try to
                // vectorize it again.
                for_each(RangeSizes.slice(Cnt, Size),
                         [](std::pair<unsigned, unsigned> &P) {
                           P.first = P.second = 0;
                         });
                if (Cnt < StartIdx + MinVF) {
                  for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  StartIdx = Cnt + Size;
                }
                if (Cnt > Sz - Size - MinVF) {
                  for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  if (Sz == End)
                    End = Cnt;
                  Sz = Cnt;
                }
                Cnt += Size;
                continue;
              }
              if (Size > 2 && Res &&
                  !all_of(RangeSizes.slice(Cnt, Size),
                          std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
                                    std::placeholders::_1))) {
                Cnt += Size;
                continue;
              }
              // Check for very big VFs that we're not rebuilding the same
              // trees, just with a larger number of elements.
              if (Size > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(Cnt, Size),
                         std::bind(FirstSizeSame, TreeSize,
                                   std::placeholders::_1))) {
                Cnt += Size;
                while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
                  ++Cnt;
                continue;
              }
              if (TreeSize > 1)
                for_each(RangeSizes.slice(Cnt, Size),
                         [&](std::pair<unsigned, unsigned> &P) {
                           if (Size >= MaxRegVF)
                             P.second = std::max(P.second, TreeSize);
                           else
                             P.first = std::max(P.first, TreeSize);
                         });
              ++Cnt;
              AnyProfitableGraph = true;
            }
            if (StartIdx >= End)
              break;
            if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
              AnyProfitableGraph = true;
            StartIdx = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(Sz),
                        std::bind(IsNotVectorized, Size >= MaxRegVF,
                                  std::placeholders::_1)));
          }
          if (!AnyProfitableGraph && Size >= MaxRegVF)
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if we tried all attempts or if there is no need for the last
        // attempts at all.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            Operands.size(),
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes, std::bind(IsNotVectorized, true,
                                                  std::placeholders::_1))) +
                1));
        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
        unsigned Limit =
            getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
        CandidateVFs.clear();
        if (bit_floor(Limit) == VF)
          CandidateVFs.push_back(Limit);
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        });
        // Last attempt to vectorize the max number of elements, if all
        // previous attempts were unsuccessful because of the cost issues.
        CandidateVFs.push_back(VF);
      }
    }
  };

  // Stores pre-sorted by distance to the base pointer.
  SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;

  // Inserts the store SI with index Idx into the set of collected stores. If
  // a store at the same distance is found, the previously collected run is
  // vectorized first to avoid duplicate analysis.
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
      std::optional<int> Diff = getPointersDiff(
          Stores[Set.first]->getValueOperand()->getType(),
          Stores[Set.first]->getPointerOperand(),
          SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
          /*StrictCheck=*/true);
      if (!Diff)
        continue;
      auto It = Set.second.find(std::make_pair(Idx, *Diff));
      if (It == Set.second.end()) {
        Set.second.emplace(Idx, *Diff);
        return;
      }
      // Try to vectorize the first found set to avoid duplicate analysis.
      TryToVectorize(Set.second);
      unsigned ItIdx = It->first;
      int ItDist = It->second;
      StoreIndexToDistSet PrevSet;
      copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
              [&](const std::pair<unsigned, int> &Pair) {
                return Pair.first > ItIdx;
              });
      Set.second.clear();
      Set.first = Idx;
      Set.second.emplace(Idx, 0);
      // Insert stores that followed the previous match so they can be tried
      // together with this store.
      unsigned StartIdx = ItIdx + 1;
      SmallBitVector UsedStores(Idx - StartIdx);
      // Distances to the previously found dup store (or this store, since
      // they store to the same addresses).
      SmallVector<int> Dists(Idx - StartIdx, 0);
      for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
        // Do not try to vectorize sequences we already tried.
        if (VectorizedStores.contains(Stores[Pair.first]))
          break;
        unsigned BI = Pair.first - StartIdx;
        UsedStores.set(BI);
        Dists[BI] = Pair.second - ItDist;
      }
      for (unsigned I = StartIdx; I < Idx; ++I) {
        unsigned BI = I - StartIdx;
        if (UsedStores.test(BI))
          Set.second.emplace(I, Dists[BI]);
      }
      return;
    }
    auto &Res = SortedStores.emplace_back();
    Res.first = Idx;
    Res.second.emplace(Idx, 0);
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (auto &Set : SortedStores)
        TryToVectorize(Set.second);
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (auto &Set : SortedStores)
    TryToVectorize(Set.second);

  return Changed;
}
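// collectSeedInstructions() scans one basic block for the two kinds of
// vectorization seeds used above: simple stores, grouped by the underlying
// object of their pointer operand, and getelementptrs with a single
// non-constant index, whose index computations may themselves be
// vectorizable.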
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type; we permit
  // an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S.getOpcode())
    return false;

  Instruction *I0 = S.getMainOp();
  // Make sure invalid types (including vector type) are rejected before
  // determining the vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give the user an internal llvm type name,
      // which may not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream rso(TypeStr);
        Ty->print(rso);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();

  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
        break;

      SmallVector<Value *> Ops(ActualVF, nullptr);
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          Ops[Idx] = V;
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough vectorizable instructions - exit.
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      R.reorderTopToBottom();
      R.reorderBottomToTop(
          /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
          !R.doesRootHaveInTreeUses());
      R.transformNodes();
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  Value *P = I->getParent();

  // Vectorize in the current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidates.
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
      R);
}
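// --- Horizontal reductions ---
// HorizontalReduction matches association trees such as
//   ((a + b) + c) + d
// (including min/max reductions expressed as cmp + select), flattens them
// into lists of reduced values, vectorizes those lists, and emits a single
// vector reduction plus scalar fix-up operations for any remainder.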
namespace {
/// Model horizontal reductions.
class HorizontalReduction {
  using ReductionOpsType = SmallVector<Value *, 16>;
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  /// List of reduction operations.
  ReductionOpsListType ReductionOps;
  /// List of possibly reduced values.
  SmallVector<SmallVector<Value *>> ReducedVals;
  /// Maps reduced value to the corresponding reduction operation.
  DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
  WeakTrackingVH ReductionRoot;
  /// The type of reduction operation.
  RecurKind RdxKind;
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;

  static bool isCmpSelMinMax(Instruction *I) {
    return isa<SelectInst>(I) &&
           RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
  }

  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I) {
    if (Kind == RecurKind::None)
      return false;

    // Integer ops that map to select instructions or intrinsics are fine.
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
        isBoolLogicOp(I))
      return true;

    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0. We do not have to
      // rule out -0.0 here because the intrinsic semantics do not specify a
      // fixed result for it.
      return I->getFastMathFlags().noNaNs();
    }

    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;

    return I->isAssociative();
  }

  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    // Poison-safe 'or' takes the form: select X, true, Y. To make that work
    // with the normal operand processing, we skip the true value operand.
    if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
      return I->getOperand(2);
    return I->getOperand(Index);
  }

  /// Creates reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    switch (Kind) {
    case RecurKind::Or:
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
      return Builder.CreateBinOp(Instruction::Or, LHS, RHS, Name);
    case RecurKind::And:
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
      return Builder.CreateBinOp(Instruction::And, LHS, RHS, Name);
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::FMax:
      return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
    case RecurKind::FMin:
      return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
    case RecurKind::FMaximum:
      return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
    case RecurKind::FMinimum:
      return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
    case RecurKind::SMax:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
    case RecurKind::SMin:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
    case RecurKind::UMax:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
    case RecurKind::UMin:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
  }

  /// Creates reduction operation with the current opcode, propagating IR
  /// flags from \p ReductionOps.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
        propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
                         /*IncludeWrapFlags=*/false);
        propagateIRFlags(Op, ReductionOps[1], nullptr,
                         /*IncludeWrapFlags=*/false);
        return Op;
      }
    }
    propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
    return Op;
  }

public:
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;
    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;
    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Try harder: match a min/max pattern expressed as cmp + select. The
      // relaxed forms only match when both compared values are
      // extractelements of the same vector.
      Value *LHS, *RHS;
      ICmpInst::Predicate Pred;
      if (!match(Select,
                 m_Select(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)),
                          m_Deferred(LHS), m_Deferred(RHS))))
        return RecurKind::None;
      if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
        return RecurKind::None;
      switch (Pred) {
      case ICmpInst::ICMP_SGT:
        return RecurKind::SMax;
      case ICmpInst::ICMP_SLT:
        return RecurKind::SMin;
      case ICmpInst::ICMP_UGT:
        return RecurKind::UMax;
      case ICmpInst::ICMP_ULT:
        return RecurKind::UMin;
      default:
        return RecurKind::None;
      }
    }
    return RecurKind::None;
  }

  /// Get the index of the first operand.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

private:
  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB. For a cmp+sel
  /// min/max reduction, check that both the compare and the select are in
  /// \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // The select must be used twice while the condition must have a single
      // use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }

    // Arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    auto *I = dyn_cast<Instruction>(Data.front());
    return Sz > 1 || isConstant(Data.front()) ||
           (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
  }

public:
  HorizontalReduction() = default;

  /// Try to find a reduction tree.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;

    // Analyze "regular" integer/FP types for reductions - no target-specific
    // types or pointers.
    Type *Ty = Root->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;

    // Though the ultimate reduction may have multiple uses, its condition
    // must have only a single use.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // If the edge is not an instruction, or it is different from the main
        // reduction opcode or has too many uses - possible reduced value.
        if (!EdgeInst || Level > RecursionMaxDepth ||
            getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to
    // reduce them. Values are grouped by their value ids, loads - by their
    // pointer operands.
    MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
        PossibleReducedVals;
    initReductionOps(Root);
    DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
      if (!LoadKeyUsed.insert(Key).second) {
        auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
        if (LIt != LoadsMap.end()) {
          for (LoadInst *RLI : LIt->second) {
            if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                                LI->getType(), LI->getPointerOperand(), DL, SE,
                                /*StrictCheck=*/true))
              return hash_value(RLI->getPointerOperand());
          }
          for (LoadInst *RLI : LIt->second) {
            if (arePointersCompatible(RLI->getPointerOperand(),
                                      LI->getPointerOperand(), TLI))
              return hash_value(RLI->getPointerOperand());
          }
          if (LIt->second.size() > 2)
            return hash_value(LIt->second.back()->getPointerOperand());
        }
      }
      LoadsMap.try_emplace(std::make_pair(Key, Ptr))
          .first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // Add reduction values. The values are sorted for better vectorization
      // results.
      for (Value *V : PossibleRedVals) {
        size_t Key, Idx;
        std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                                               /*AllowAlternate=*/false);
        ++PossibleReducedVals[Key][Idx]
              .insert(std::make_pair(V, 0))
              .first->second;
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds to start the reduction
    // from the longest possible reduced values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
           It != E; ++It) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = It->second.takeVector();
        stable_sort(RedValsVect, llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      int NewIdx = -1;
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (NewIdx < 0 ||
            (!isGoodForReduction(Data) &&
             (!isa<LoadInst>(Data.front()) ||
              !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
              getUnderlyingObject(
                  cast<LoadInst>(Data.front())->getPointerOperand()) !=
                  getUnderlyingObject(
                      cast<LoadInst>(ReducedVals[NewIdx].front())
                          ->getPointerOperand())))) {
          NewIdx = ReducedVals.size();
          ReducedVals.emplace_back();
        }
        ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
      }
    }
    // Sort the reduced values by the number of same/alternate opcodes and/or
    // pointer operands.
    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
19777 constexpr unsigned RegMaxNumber = 4;
19778 constexpr unsigned RedValsMaxNumber = 128;
19782 if (
unsigned NumReducedVals = std::accumulate(
19783 ReducedVals.
begin(), ReducedVals.
end(), 0,
19785 if (!isGoodForReduction(Vals))
19787 return Num + Vals.size();
19789 NumReducedVals < ReductionLimit &&
19793 for (ReductionOpsType &RdxOps : ReductionOps)
19794 for (
Value *RdxOp : RdxOps)
19795 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19806 ReducedVals.
front().size());
19810 auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
19811 assert(isa<SelectInst>(RdxRootInst) &&
19812 "Expected min/max reduction to have select root instruction");
19813 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19814 assert(isa<Instruction>(ScalarCond) &&
19815 "Expected min/max reduction to have compare condition");
19816 return cast<Instruction>(ScalarCond);
19819 bool AnyBoolLogicOp =
any_of(ReductionOps.back(), [](
Value *V) {
19820 return isBoolLogicOp(cast<Instruction>(V));
19823 auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
19824 if (VectorizedTree) {
19827 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19828 if (AnyBoolLogicOp) {
19829 auto It = ReducedValsToOps.
find(VectorizedTree);
19830 auto It1 = ReducedValsToOps.
find(Res);
19831 if ((It == ReducedValsToOps.
end() && It1 == ReducedValsToOps.
end()) ||
19833 (It != ReducedValsToOps.
end() &&
19835 return isBoolLogicOp(I) &&
19836 getRdxOperand(I, 0) == VectorizedTree;
19840 (It1 != ReducedValsToOps.
end() &&
19842 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19846 VectorizedTree = Builder.
CreateFreeze(VectorizedTree);
19850 return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
19857 ReductionOps.front().size());
19858 for (ReductionOpsType &RdxOps : ReductionOps)
19859 for (
Value *RdxOp : RdxOps) {
19862 IgnoreList.insert(RdxOp);
19867 for (
Value *U : IgnoreList)
19868 if (
auto *FPMO = dyn_cast<FPMathOperator>(U))
19869 RdxFMF &= FPMO->getFastMathFlags();
19870 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19875 for (
Value *V : Candidates)
19876 TrackedVals.try_emplace(V, V);
19879 Value *
V) ->
unsigned & {
19880 auto *It = MV.
find(V);
19881 assert(It != MV.
end() &&
"Unable to find given key.");
19890 bool CheckForReusedReductionOps =
false;
19895 for (
unsigned I = 0,
E = ReducedVals.
size();
I <
E; ++
I) {
19897 InstructionsState S = States[
I];
19901 for (
unsigned Cnt = 0, Sz = OrigReducedVals.
size(); Cnt < Sz; ++Cnt) {
19902 Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
19907 auto *Inst = dyn_cast<Instruction>(RdxVal);
19909 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
19910 (S.getOpcode() && !Inst))
19913 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19915 bool ShuffledExtracts =
false;
19917 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
19920 for (
Value *RV : ReducedVals[
I + 1]) {
19921 Value *RdxVal = TrackedVals.at(RV);
19925 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19928 CommonCandidates.push_back(RdxVal);
19929 TrackedToOrig.try_emplace(RdxVal, RV);
19934 Candidates.
swap(CommonCandidates);
19935 ShuffledExtracts =
true;
19942 Value *OrigV = TrackedToOrig.at(Candidates.
front());
19943 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19945 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
19946 Value *OrigV = TrackedToOrig.at(VC);
19947 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19948 if (
auto *ResI = dyn_cast<Instruction>(Res))
19949 V.analyzedReductionRoot(ResI);
19951 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
19955 unsigned NumReducedVals = Candidates.
size();
19956 if (NumReducedVals < ReductionLimit &&
19957 (NumReducedVals < 2 || !
isSplat(Candidates)))
19962 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
19963 RdxKind != RecurKind::FMul &&
19964 RdxKind != RecurKind::FMulAdd;
      // Gather the repeated values.
      SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
      if (IsSupportedHorRdxIdentityOp)
        for (Value *V : Candidates) {
          Value *OrigV = TrackedToOrig.at(V);
          ++SameValuesCounter.try_emplace(OrigV).first->second;
        }
      // Check if all reduced values are reused the same number of times. In
      // that case the compiler can emit a single scalar scaling op instead
      // of the full gather-and-reduce sequence.
      bool SameScaleFactor = false;
      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                              SameValuesCounter.size() != Candidates.size();
      if (OptReusedScalars) {
        SameScaleFactor =
            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
             RdxKind == RecurKind::Xor) &&
            all_of(drop_begin(SameValuesCounter),
                   [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
                     return P.second == SameValuesCounter.front().second;
                   });
        Candidates.resize(SameValuesCounter.size());
        transform(SameValuesCounter, Candidates.begin(),
                  [&](const auto &P) { return TrackedVals.at(P.first); });
        NumReducedVals = Candidates.size();
        // Have a reduction of the same element.
        if (NumReducedVals == 1) {
          Value *OrigV = TrackedToOrig.at(Candidates.front());
          unsigned Cnt = At(SameValuesCounter, OrigV);
          Value *RedVal =
              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(OrigV, Cnt);
          ExternallyUsedValues.insert(OrigV);
          continue;
        }
      }

      unsigned MaxVecRegSize = V.getMaxVecRegSize();
      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
      const unsigned MaxElts = std::clamp<unsigned>(
          llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
          RegMaxNumber * RedValsMaxNumber);

      unsigned ReduxWidth = NumReducedVals;
      // Pick the widest vector factor that still fits the register budget.
      auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
        unsigned NumParts, NumRegs;
        Type *ScalarTy = Candidates.front()->getType();
        ReduxWidth =
            getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
        VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
        NumParts = TTI.getNumberOfParts(Tp);
        NumRegs =
            TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
        while (NumParts > NumRegs) {
          ReduxWidth = bit_floor(ReduxWidth - 1);
          Tp = getWidenedType(ScalarTy, ReduxWidth);
          NumParts = TTI.getNumberOfParts(Tp);
          NumRegs =
              TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
        }
        if (NumParts > NumRegs / 2)
          ReduxWidth = bit_floor(ReduxWidth);
        return ReduxWidth;
      };
      if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
        ReduxWidth = GetVectorFactor(ReduxWidth);
      ReduxWidth = std::min(ReduxWidth, MaxElts);
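      // NOTE (illustrative): GetVectorFactor shrinks the candidate width
      // until the widened vector type fits the register budget, e.g.
      // starting from ReduxWidth == 24 the loop steps to bit_floor(23) == 16
      // and stops once the number of register parts no longer exceeds the
      // number of available registers; the result is then clamped to
      // MaxElts above.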
      unsigned Start = 0;
      unsigned Pos = Start;
      // Restarts vectorization attempt with a lower vector factor.
      unsigned PrevReduxWidth = ReduxWidth;
      bool CheckForReusedReductionOpsLocal = false;
      auto AdjustReducedVals = [&](bool IgnoreVL = false) {
        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, it is
          // worth trying again with a smaller number of reduction ops.
          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
        }
        ++Pos;
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        Pos = Start;
        --ReduxWidth;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(ReduxWidth);
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
      SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // Dependency in the tree of the reduction ops - drop this attempt,
        // try later.
        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
            Start == 0) {
          CheckForReusedReductionOps = true;
          break;
        }
        PrevReduxWidth = ReduxWidth;
        ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
        // Been analyzed already - skip.
        if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
            (!has_single_bit(ReduxWidth) &&
             (IgnoredCandidates.contains(
                  std::make_pair(Pos, bit_floor(ReduxWidth))) ||
              IgnoredCandidates.contains(
                  std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
                                 bit_floor(ReduxWidth))))) ||
            V.areAnalyzedReductionVals(VL)) {
          (void)AdjustReducedVals(/*IgnoreVL=*/true);
          continue;
        }
        // Early exit if any of the reduction values were deleted during
        // previous vectorization attempts.
        if (any_of(VL, [&V](Value *RedVal) {
              auto *RedValI = dyn_cast<Instruction>(RedVal);
              if (!RedValI)
                return false;
              return V.isDeleted(RedValI);
            }))
          break;
        V.buildTree(VL, IgnoreList);
        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        if (V.isLoadCombineReductionCandidate(RdxKind)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        V.reorderTopToBottom();
        // No need to reorder the root node at all.
        V.reorderBottomToTop(/*IgnoreReorder=*/true);
        // Keep extracted other reduction values, if they are used in the
        // vectorization trees.
        BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
            ExternallyUsedValues);
        // The reduction root is reused as the insertion point for new
        // instructions, so keep it alive as an external use.
        LocalExternallyUsedValues.insert(ReductionRoot);
        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
            continue;
          for (Value *V : ReducedVals[Cnt])
            if (isa<Instruction>(V))
              LocalExternallyUsedValues.insert(TrackedVals[V]);
        }
        if (!IsSupportedHorRdxIdentityOp) {
          // Number of uses of the candidates in the vector of values.
          assert(SameValuesCounter.empty() &&
                 "Reused values counter map is not empty");
          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
              continue;
            Value *V = Candidates[Cnt];
            Value *OrigV = TrackedToOrig.at(V);
            ++SameValuesCounter.try_emplace(OrigV).first->second;
          }
        }
        V.transformNodes();
        SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
        // Gather externally used values.
        SmallPtrSet<Value *, 4> Visited;
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *RdxVal = Candidates[Cnt];
          if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
            RdxVal = It->second;
          if (!Visited.insert(RdxVal).second)
            continue;
          // Check if the scalar was vectorized as part of the vectorization
          // tree but not the top node.
          if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
            LocalExternallyUsedValues.insert(RdxVal);
            continue;
          }
          Value *OrigV = TrackedToOrig.at(RdxVal);
          unsigned NumOps =
              VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
          if (NumOps != ReducedValsToOps.at(OrigV).size())
            LocalExternallyUsedValues.insert(RdxVal);
        }
        // The list of reused scalars is not needed in regular mode anymore.
        if (!IsSupportedHorRdxIdentityOp)
          SameValuesCounter.clear();
        for (Value *RdxVal : VL)
          if (RequiredExtract.contains(RdxVal))
            LocalExternallyUsedValues.insert(RdxVal);
        V.buildExternalUses(LocalExternallyUsedValues);

        V.computeMinimumValueSizes();
        // Estimate cost.
        InstructionCost TreeCost = V.getTreeCost(VL);
        InstructionCost ReductionCost =
            getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
        InstructionCost Cost = TreeCost + ReductionCost;
        LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                          << " for reduction\n");
        if (!Cost.isValid())
          break;
        if (Cost >= -SLPCostThreshold) {
          V.getORE()->emit([&]() {
            return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
                                            ReducedValsToOps.at(VL[0]).front())
                   << "Vectorizing horizontal reduction is possible "
                   << "but not beneficial with cost " << ore::NV("Cost", Cost)
                   << " and threshold "
                   << ore::NV("Threshold", -SLPCostThreshold);
          });
          if (!AdjustReducedVals()) {
            V.analyzedReductionVals(VL);
            unsigned Offset = Pos == Start ? Pos : Pos - 1;
            if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
              // Add subvectors of VL to the list of the analyzed values.
              for (unsigned VF = getFloorFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(), ReduxWidth - 1);
                   VF >= ReductionLimit;
                   VF = getFloorFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(), VF - 1)) {
                if (has_single_bit(VF) &&
                    V.getCanonicalGraphSize() != V.getTreeSize())
                  continue;
                for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
                  IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
              }
            }
          }
          continue;
        }

        LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                          << Cost << ". (HorRdx)\n");
        V.getORE()->emit([&]() {
          return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
                                    ReducedValsToOps.at(VL[0]).front())
                 << "Vectorized horizontal reduction with cost "
                 << ore::NV("Cost", Cost) << " and with tree size "
                 << ore::NV("TreeSize", V.getTreeSize());
        });
        Builder.setFastMathFlags(RdxFMF);

        // Emit a reduction. If the root is a min/max select, use the compare
        // condition as the insertion point for the new instructions.
        Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
        Instruction *InsertPt = RdxRootInst;
        if (IsCmpSelMinMax)
          InsertPt = GetCmpForMinMaxReduction(RdxRootInst);

        // Vectorize the tree.
        Value *VectorizedRoot =
            V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
        // Update the TrackedToOrig mapping, since the tracked values might
        // have been updated.
        for (Value *RdxVal : Candidates) {
          Value *OrigVal = TrackedToOrig.at(RdxVal);
          Value *TransformedRdxVal = TrackedVals.at(OrigVal);
          if (TransformedRdxVal != RdxVal)
            TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
        }

        Builder.SetInsertPoint(InsertPt);

        // To prevent poison from leaking across what used to be sequential,
        // safe, scalar boolean logic operations, the reduction operand must
        // be frozen.
        if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
          VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

        // Emit code to correctly handle reused reduced values, if required.
        if (OptReusedScalars && !SameScaleFactor) {
          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                         SameValuesCounter, TrackedToOrig);
        }

        Value *ReducedSubTree;
        Type *ScalarTy = VL.front()->getType();
        if (isa<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          unsigned ScalarTyNumElements = getNumElements(ScalarTy);
          ReducedSubTree = PoisonValue::get(getWidenedType(
              VectorizedRoot->getType()->getScalarType(),
              ScalarTyNumElements));
          for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
            // Do the reduction for each lane separately.
            SmallVector<int, 16> Mask =
                createStrideMask(I, ScalarTyNumElements, VL.size());
            Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
            ReducedSubTree = Builder.CreateInsertElement(
                ReducedSubTree,
                emitReduction(Lane, Builder, TTI, RdxRootInst->getType()),
                I);
          }
        } else {
          ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
                                         RdxRootInst->getType());
        }
        if (ReducedSubTree->getType() != VL.front()->getType()) {
          assert(ReducedSubTree->getType() != VL.front()->getType() &&
                 "Expected different reduction type.");
          ReducedSubTree =
              Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
                                    V.isSignedMinBitwidthRootNode());
        }

        // Improved analysis for add/fadd/xor reductions with the same scale
        // factor for all operands: emit scalar ops instead.
        if (OptReusedScalars && SameScaleFactor)
          ReducedSubTree = emitScaleForReusedOps(
              ReducedSubTree, Builder, SameValuesCounter.front().second);

        VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
        // Count vectorized reduced values to exclude them from the final
        // reduction.
        for (Value *RdxVal : VL) {
          Value *OrigV = TrackedToOrig.at(RdxVal);
          if (IsSupportedHorRdxIdentityOp) {
            VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
            continue;
          }
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (!V.isVectorized(RdxVal))
            RequiredExtract.insert(RdxVal);
        }
        Pos += ReduxWidth;
        Start = Pos;
        ReduxWidth = NumReducedVals - Pos;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
        AnyVectorized = true;
      }
      if (OptReusedScalars && !AnyVectorized) {
        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
          Value *RdxVal = TrackedVals.at(P.first);
          Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(P.first, P.second);
        }
      }
    }
    if (VectorizedTree) {
      // Reorder the operands of a bool logical op into the natural order to
      // avoid poison propagation problems. If reordering is not possible
      // (both operands are originally RHS operands), emit an extra freeze
      // for the LHS operand instead.
      auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
                                                   Instruction *RedOp1,
                                                   Instruction *RedOp2,
                                                   bool InitStep) {
        if (!AnyBoolLogicOp)
          return;
        if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                      getRdxOperand(RedOp1, 0) == LHS ||
                                      isGuaranteedNotToBePoison(LHS, AC)))
          return;
        if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                      getRdxOperand(RedOp2, 0) == RHS ||
                                      isGuaranteedNotToBePoison(RHS, AC))) {
          std::swap(LHS, RHS);
          return;
        }
        if (LHS != VectorizedTree)
          LHS = Builder.CreateFreeze(LHS);
      };
      // Finish the reduction: add the extra arguments and the possibly
      // non-vectorized reduction values, avoiding dependencies between the
      // scalar remainders.
      auto FinalGen =
          [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
              bool InitStep) {
            unsigned Sz = InstVals.size();
            SmallVector<std::pair<Instruction *, Value *>> ExtraReds(
                Sz / 2 + Sz % 2);
            for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
              Instruction *RedOp = InstVals[I + 1].first;
              Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
              Value *RdxVal1 = InstVals[I].second;
              Value *StableRdxVal1 = RdxVal1;
              auto It1 = TrackedVals.find(RdxVal1);
              if (It1 != TrackedVals.end())
                StableRdxVal1 = It1->second;
              Value *RdxVal2 = InstVals[I + 1].second;
              Value *StableRdxVal2 = RdxVal2;
              auto It2 = TrackedVals.find(RdxVal2);
              if (It2 != TrackedVals.end())
                StableRdxVal2 = It2->second;
              // To prevent poison from leaking across what used to be
              // sequential, safe, scalar boolean logic operations, the
              // reduction operand must be frozen.
              FixBoolLogicalOps(StableRdxVal1, StableRdxVal2,
                                InstVals[I].first, RedOp, InitStep);
              Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                         StableRdxVal2, "op.rdx",
                                         ReductionOps);
              ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
            }
            if (Sz % 2 == 1)
              ExtraReds[Sz / 2] = InstVals.back();
            return ExtraReds;
          };
      SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
      ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
                                   VectorizedTree);
      SmallPtrSet<Value *, 8> Visited;
      for (ArrayRef<Value *> Candidates : ReducedVals) {
        for (Value *RdxVal : Candidates) {
          if (!Visited.insert(RdxVal).second)
            continue;
          unsigned NumOps = VectorizedVals.lookup(RdxVal);
          for (Instruction *RedOp :
               ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
            ExtraReductions.emplace_back(RedOp, RdxVal);
        }
      }
      // Iterate through all non-vectorized reduction values/extra arguments.
      bool InitStep = true;
      while (ExtraReductions.size() > 1) {
        SmallVector<std::pair<Instruction *, Value *>> NewReds =
            FinalGen(ExtraReductions, InitStep);
        ExtraReductions.swap(NewReds);
        InitStep = false;
      }
      VectorizedTree = ExtraReductions.front().second;

      ReductionRoot->replaceAllUsesWith(VectorizedTree);

      // The original scalar reduction is expected to have no remaining uses
      // outside the reduction tree itself. Assert that we got this correct,
      // replace the internal uses with poison, and mark the ops for deletion.
#ifndef NDEBUG
      SmallSet<Value *, 4> IgnoreSet;
      for (ArrayRef<Value *> RdxOps : ReductionOps)
        IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
#endif
      for (ArrayRef<Value *> RdxOps : ReductionOps) {
        for (Value *Ignore : RdxOps) {
          if (!Ignore)
            continue;
#ifndef NDEBUG
          for (auto *U : Ignore->users()) {
            assert(IgnoreSet.count(U) &&
                   "All users must be either in the reduction ops list.");
          }
#endif
          if (!Ignore->use_empty()) {
            Value *P = PoisonValue::get(Ignore->getType());
            Ignore->replaceAllUsesWith(P);
          }
        }
        V.removeInstructionsAndOperands(RdxOps);
      }
    } else if (!CheckForReusedReductionOps) {
      // Allow the reduction ops list to be vectorized later.
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
    }
    return VectorizedTree;
  }
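  // NOTE (illustrative): FinalGen reduces the leftover scalar values
  // pairwise, halving the list each round. For partial results
  // [a, b, c, d, e]: round 1 -> [a+b, c+d, e], round 2 -> [(a+b)+(c+d), e],
  // round 3 -> final. This keeps the remainder chain shallow instead of
  // strictly sequential.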
  /// Calculate the cost of a reduction.
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, FastMathFlags FMF,
                                   const BoUpSLP &R) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    Type *ScalarTy = ReducedVals.front()->getType();
    unsigned ReduxWidth = ReducedVals.size();
    FixedVectorType *VectorTy = getWidenedType(ScalarTy, ReduxWidth);
    InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at compile time.
    bool AllConsts = allConstant(ReducedVals);
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      InstructionCost Cost = 0;
      // The scalar cost is repeated for N-1 elements.
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        if (Cnt == 1)
          break;
        --Cnt;
        if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
          Cost += GenCostFn();
          continue;
        }
        InstructionCost ScalarCost = 0;
        for (User *U : RdxVal->users()) {
          auto *RdxOp = cast<Instruction>(U);
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
            ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
            continue;
          }
          ScalarCost = InstructionCost::getInvalid();
          break;
        }
        if (ScalarCost.isValid())
          Cost += ScalarCost;
        else
          Cost += GenCostFn();
      }
      return Cost;
    };
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
      if (!AllConsts) {
        if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
          // Revectorization: cost each lane's reduction separately.
          for (unsigned I : seq<unsigned>(ReducedVals.size())) {
            // ... (per-lane shuffle and reduction cost elided in this
            // excerpt)
          }
        } else {
          Type *RedTy = VectorTy->getElementType();
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              std::make_pair(RedTy, true));
          if (RType == RedTy) {
            VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
                                                         FMF, CostKind);
          } else {
            // ... (cost of the reduction plus the extending cast elided)
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
      if (!AllConsts)
        VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
      ScalarCost = EvaluateScalarCost([&]() {
        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
        return TTI->getIntrinsicInstrCost(ICA, CostKind);
      });
      break;
    }
    default:
      llvm_unreachable("Expected arithmetic or min/max reduction operation");
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
  /// Emit a horizontal reduction of the vectorized value.
  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                       const TargetTransformInfo *TTI, Type *DestTy) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");

    auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (FTy->getScalarType() == Builder.getInt1Ty() &&
        RdxKind == RecurKind::Add &&
        DestTy->getScalarType() != FTy->getScalarType()) {
      // Convert vector_reduce_add(ZExt(<n x i1>)) to
      // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
      Value *V = Builder.CreateBitCast(
          VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
      ++NumVectorInstructions;
      Value *Ctpop = Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
      return Builder.CreateZExtOrTrunc(Ctpop, DestTy);
    }
    ++NumVectorInstructions;
    return createSimpleReduction(Builder, VectorizedValue, RdxKind);
  }
  /// Emits optimized code for a unique scalar value reused \p Cnt times.
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    if (Cnt == 1)
      return VectorizedValue;
    switch (RdxKind) {
    case RecurKind::Add: {
      // res = mul vv, n
      Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::Xor: {
      // res = n % 2 ? vv : 0
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      if (Cnt % 2 == 0)
        return Constant::getNullValue(VectorizedValue->getType());
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul vv, n
      Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // res = vv (idempotent operations)
      return VectorizedValue;
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::IAnyOf:
    case RecurKind::FAnyOf:
    case RecurKind::IFindLastIV:
    case RecurKind::FFindLastIV:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
    }
    return nullptr;
  }
  /// Emits the actual operation for the scalar identity values found during
  /// horizontal reduction analysis.
  Value *
  emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
                const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
                const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    ArrayRef<Value *> VL = R.getRootNodeScalars();
    auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
    if (VTy->getElementType() != VL.front()->getType()) {
      VectorizedValue = Builder.CreateIntCast(
          VectorizedValue,
          getWidenedType(VL.front()->getType(), VTy->getNumElements()),
          R.isSignedMinBitwidthRootNode());
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      // root = mul prev_root, <1, 1, n, 1>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
      }
      auto *Scale = ConstantVector::get(Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple or/and(s).
      LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // No need for multiple min/max(s) of the same value.
      LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace values with an even number of repeats with 0, since
      // x xor x = 0.
      SmallVector<int> Mask(
          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
          PoisonMaskElem);
      std::iota(Mask.begin(), Mask.end(), 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        Value *V = VL[I];
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        if (Cnt % 2 == 0) {
          Mask[I] = VF;
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I : Mask) dbgs() << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            VectorizedValue,
            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
      }
      auto *Scale = ConstantVector::get(Vals);
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::IAnyOf:
    case RecurKind::FAnyOf:
    case RecurKind::IFindLastIV:
    case RecurKind::FFindLastIV:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for reused scalars.");
    }
    return nullptr;
  }
};
/// Gets the recurrence kind from the specified value.
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}
static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();

  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0))
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}
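// NOTE (illustrative): for an insertvalue of type [2 x <4 x float>] the loop
// above computes 2 (array) * 4 (vector) = 8 flattened lanes, while a struct
// such as {i32, float} yields std::nullopt because its element types differ.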
static void findBuildAggregate_rec(Instruction *LastInsertInst,
                                   TargetTransformInfo *TTI,
                                   SmallVectorImpl<Value *> &BuildVectorOpds,
                                   SmallVectorImpl<Value *> &InsertElts,
                                   unsigned OperandOffset, const BoUpSLP &R) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(1);
    std::optional<unsigned> OperandIndex =
        getElementIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex || R.isDeleted(LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
                             BuildVectorOpds, InsertElts, *OperandIndex, R);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}
static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts,
                               const BoUpSLP &R) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");

  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);

  findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
                         R);
  llvm::erase(BuildVectorOpds, nullptr);
  llvm::erase(InsertElts, nullptr);
  if (BuildVectorOpds.size() >= 2)
    return true;

  return false;
}
static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {
  // There are situations where the reduction value is not dominated by the
  // reduction phi. Vectorizing such cases has been reported to cause
  // miscompiles.
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBLoop = LI->getLoopFor(ParentBB);
  if (!BBLoop)
    return nullptr;
  BasicBlock *BBLatch = BBLoop->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch; return the incoming value if it comes from that.
  // This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}
/// We could have an initial reduction that is not an add, e.g.
///   r *= v1 + v2 + v3 + v4
/// In such a case, start looking for a tree rooted at the first '+'.
/// \returns the new root if found, which may be nullptr if it is not an
/// instruction.
static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
                                                 Instruction *Root) {
  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
          isa<IntrinsicInst>(Root)) &&
         "Expected binop, select, or intrinsic for reduction matching");
  Value *LHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
  Value *RHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  if (LHS == Phi)
    return dyn_cast<Instruction>(RHS);
  if (RHS == Phi)
    return dyn_cast<Instruction>(LHS);
  return nullptr;
}

/// Returns the first operand of \p I that does not match \p Phi. If the
/// operand is not an instruction, it returns nullptr.
static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
  if (!matchRdxBop(I, Op0, Op1))
    return nullptr;
  return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
}

/// \returns true if \p I is a candidate instruction for reduction
/// vectorization.
static bool isReductionCandidate(Instruction *I) {
  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
  Value *B0 = nullptr, *B1 = nullptr;
  bool IsBinop = matchRdxBop(I, B0, B1);
  return IsBinop || IsSelect;
}
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  if (!ShouldVectorizeHor)
    return false;
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);

  if (Root->getParent() != BB || isa<PHINode>(Root))
    return false;

  // If we can find a secondary reduction root, use that instead.
  auto SelectRoot = [&]() {
    if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
        HorizontalReduction::getRdxKind(Root) != RecurKind::None)
      if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
        return NewRoot;
    return Root;
  };

  // Start the analysis from the Root instruction. If a horizontal reduction
  // is found, try to vectorize it. If that is not possible or not effective,
  // and the currently analyzed instruction is a binary operation, try to
  // vectorize its operands.
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  auto TryToReduce = [this, &R](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    if (!isReductionCandidate(Inst))
      return nullptr;
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      FutureSeed = getNonPhiOperand(Root, P);
      if (!FutureSeed)
        return false;
    }
    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
    // analysis is done separately.
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
      PostponedInsts.push_back(FutureSeed);
    return true;
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // Do not try to analyze an instruction that has already been vectorized.
    // This may happen when we vectorize instruction operands on a previous
    // iteration, while the stack was populated before that happened.
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        // Try to find another reduction.
        Stack.emplace(I, Level);
        continue;
      }
      if (R.isDeleted(Inst))
        continue;
    } else {
      // We could not vectorize `Inst`, so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        assert(Stack.empty() && "Expected empty stack");
        break;
      }
    }

    // Try to vectorize the operands. Continue the analysis only for
    // instructions from the same basic block, to save compile time.
    if (++Level < RecursionMaxDepth)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Op).second)
          if (auto *I = dyn_cast<Instruction>(Op))
            // Do not try to vectorize CmpInst operands; this is done
            // separately.
            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(
                    I) &&
                !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(I, Level);
  }
  return Res;
}
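// NOTE (illustrative): the std::queue above makes this a breadth-first walk
// from the reduction root. A candidate that fails to vectorize pushes its
// same-block operand instructions (up to RecursionMaxDepth levels), so
// reductions feeding the failed one are still discovered.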
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // An aggregate value is unlikely to be processed in a vector register.
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
      (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(T *, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent and operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(*SameTypeIt, *IncIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success: start over, because instructions might have been changed.
      Changed = true;
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL)
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
    } else {
      /// \returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL)
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success: start over, because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
                                                  /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
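// NOTE (hypothetical, simplified sketch; not part of the pass): the core
// sort-then-group pattern that tryToVectorizeSequence implements. Compatible
// values are made adjacent by a stable sort, then maximal compatible runs
// are offered to the vectorization callback. The real function above
// additionally retries leftover candidates without the MaxVFOnly
// restriction.
template <typename T>
static bool vectorizeRunsSketch(SmallVectorImpl<T *> &Incoming,
                                function_ref<bool(T *, T *)> Less,
                                function_ref<bool(T *, T *)> Compatible,
                                function_ref<bool(ArrayRef<T *>)> TryRange) {
  bool Changed = false;
  stable_sort(Incoming, Less); // group compatible values next to each other
  for (auto *It = Incoming.begin(), *E = Incoming.end(); It != E;) {
    auto *RunEnd = std::next(It);
    while (RunEnd != E && Compatible(*RunEnd, *It))
      ++RunEnd;
    if (RunEnd - It > 1) // a single value cannot form a bundle
      Changed |= TryRange(ArrayRef<T *>(It, RunEnd));
    It = RunEnd;
  }
  return Changed;
}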
/// Compare two cmp instructions. If IsCompatibility is true, the function
/// returns true if the two cmps are compatible (may be bundled together);
/// otherwise it acts as a strict weak ordering comparator for sorting.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes within the same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
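// NOTE (illustrative): instantiated with IsCompatibility == false the
// function above is a strict-weak-ordering comparator ("is V ordered before
// V2"), and with IsCompatibility == true it answers "may V and V2 be bundled
// together". Sharing one body keeps the two predicates consistent, which the
// sort-then-group driver relies on.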
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
        if (R.isDeleted(I))
          break;
      }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize the list of compares. Sort by type, compare predicate,
  // etc.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, *TLI, *DT);
  };

  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    return compareCmp<true>(V1, V2, *TLI, *DT);
  };

  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // Pass 1: try to match and vectorize a buildvector sequence for MaxVF
    // only.
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/true);
    }
    // Pass 2: try to vectorize reductions only.
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    // Pass 3: try to match and vectorize a buildvector sequence.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R,
                                             /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize the postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node; this allows a better identification of vectorizable chains.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    // Compare type IDs first, then scalar widths, then the collected
    // non-phi operands.
    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
      return true;
    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
      return false;
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first, ordered by dominator-tree DFS number and
        // opcode.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S.getOpcode() && !S.isAltShuffle())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
        if (C1 && C2)
          continue;
        if (C1)
          return true;
        if (C2)
          return false;
      }
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      {
        // Non-constant non-instructions come next, ordered by value ID.
        if (!U1 && !U2) {
          auto ValID1 = Opcodes1[I]->getValueID();
          auto ValID2 = Opcodes2[I]->getValueID();
          if (ValID1 == ValID2)
            continue;
          if (ValID1 < ValID2)
            return true;
          if (ValID1 > ValID2)
            return false;
        }
        if (!U1)
          return true;
        if (!U2)
          return false;
      }
      // Undefs come last.
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S.getOpcode())
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs. PHIs appear only at the
    // start of the block, so stop at the first non-phi (or an oversized phi).
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
        break;

      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    // Find the corresponding non-phi nodes for better matching when trying
    // to build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
  } while (HaveVectorizedPhiNodes);
  VisitedInstrs.clear();

  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes the inserts in PostProcessInserts and, if VectorizeCmps is
  // true, also vectorizes PostProcessCmps.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if I is in PostProcessInserts or PostProcessCmps.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if the instruction produces no value that is used elsewhere.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable type: the number of elements is
    // unknown at compile time.
    if (isa<ScalableVectorType>(It->getType()))
      continue;

    // Skip instructions marked for deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times, so skip the ones we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // Start over, since some instructions were deleted and the iterator
        // may have been invalidated.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    if (isa<DbgInfoIntrinsic>(It))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // Skip if the incoming block is the current BB for now. Also bypass
        // unreachable IR for efficiency and to avoid crashing.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here; delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res =
              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          Changed |= Res;
          if (Res && R.isDeleted(P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize the chain in the store if this is the only store
        // to the address in the block.
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here; delay
          // their vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // Start the vectorization of the post-process list of instructions
      // from the top-tree instructions, to vectorize as many instructions as
      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // Start over, since some instructions were deleted and the iterator
        // may have been invalidated.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. The maximum number of elements is based on the size of
    // the index expression, rather than the size of the GEP itself.
    auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Use a SetVector to
      // preserve program order.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some candidates may already have been vectorized, or may have an
      // index that was folded to a constant; remove them.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove all pairs of getelementptrs with constant differences, since
      // one can be computed from the other, and ensure all candidate indices
      // are unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1;
           ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try to vectorize the index computations.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointer and value operand. Value operands must be
  // compatible (same opcode, same parent), otherwise it is definitely not
  // profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
        DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        return S.getOpcode() > 0;
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    // Reverse the stores to do bottom-to-top analysis. This matters if the
    // same addresses are stored to several times; in that case the order of
    // the stores (reversed to meet the memory dependencies) must be
    // followed.
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
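// NOTE (illustrative): StoreSorter orders stores by value-type ID,
// pointer-type ID, scalar width, and finally the dominator-tree DFS position
// of the stored instruction, so e.g. all "store i32 %x, ptr %p" style stores
// of one block become adjacent and reach vectorizeStores as a single
// candidate run.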
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
void clearAllBits()
Set every bit to 0.
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
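ArrayRef's slicing methods return non-owning views, so chaining them is cheap. A small sketch (illustrative only; the function is hypothetical):

  #include "llvm/ADT/ArrayRef.h"
  #include <algorithm>
  using namespace llvm;

  int sumWindow(ArrayRef<int> VL) {
    if (VL.empty())
      return 0;
    ArrayRef<int> Tail = VL.drop_front();                     // view without element 0
    ArrayRef<int> Win = Tail.take_front(std::min<size_t>(4, Tail.size()));
    int Sum = VL.front();
    for (int V : Win)
      Sum += V;
    return Sum;
  }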
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR changes inside it.
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst); holds everything related to calling a function.
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on each one of them.
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
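A note on the predicate helpers: swapping the operands of a compare calls for getSwappedPredicate, while logically negating the compare calls for getInversePredicate; conflating the two is a classic bug. A short illustrative sketch:

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  void predicateDemo() {
    CmpInst::Predicate P = CmpInst::ICMP_SLT;               // a <s b
    CmpInst::Predicate S = CmpInst::getSwappedPredicate(P); // ICMP_SGT: b >s a
    CmpInst::Predicate I = CmpInst::getInversePredicate(P); // ICMP_SGE: !(a <s b)
    (void)S; (void)I;
  }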
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate for the type of this constant.
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a multiple of 8.
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
static bool shouldExecute(unsigned CounterName)
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
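The DenseMap members above differ in what they do on a missing key; a sketch (illustrative, hypothetical names) contrasting them:

  #include "llvm/ADT/DenseMap.h"
  using namespace llvm;

  void denseMapDemo() {
    DenseMap<int, const char *> M;
    M.try_emplace(1, "one");        // constructs in place; no-op if key 1 exists
    M.insert({2, "two"});           // pair form; also a no-op on collision
    const char *S = M.lookup(3);    // missing key: default value (nullptr here)
    if (M.contains(2))
      M.erase(2);
    // M.at(3) would abort: use it only when the key is known to exist.
    (void)S;
  }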
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr)
ConstantInt * getFalse()
Get the constant value for i1 false.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
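Most of the generated vector code flows through these IRBuilder calls. A hedged sketch of a typical emission sequence (the function and names are hypothetical):

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Interleave the low halves of two 4-wide vectors at the end of BB.
  Value *emitInterleave(BasicBlock *BB, Value *V1, Value *V2) {
    IRBuilder<> Builder(BB->getContext());
    Builder.SetInsertPoint(BB);                 // append to the end of BB
    int Mask[] = {0, 4, 1, 5};
    Value *Shuf = Builder.CreateShuffleVector(V1, V2, Mask, "interleave");
    return Builder.CreateExtractElement(Shuf, Builder.getInt32(0), "lane0");
  }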
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos lives in, right after MovePos.
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instruction comes before Other.
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
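MapVector fits where map lookup is needed but iteration must be deterministic (independent of pointer hashing). A brief illustrative sketch:

  #include "llvm/ADT/MapVector.h"
  #include "llvm/ADT/StringRef.h"
  using namespace llvm;

  void mapVectorDemo() {
    MapVector<StringRef, int> Counts;
    Counts.try_emplace("store", 2);
    Counts.insert({"load", 5});
    for (auto &KV : Counts)           // visits "store" then "load": insertion order
      KV.second += 1;
    auto Pairs = Counts.takeVector(); // drain entries into the underlying vector
    (void)Pairs;
  }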
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience functions.
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
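getSCEV plus getMinusSCEV is the standard way to prove two pointers are a fixed distance apart, the heart of consecutive-load detection. A hedged sketch (hypothetical helper; assumes the caller already holds a ScalarEvolution reference):

  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  using namespace llvm;

  // True if PtrB is exactly ElemSize bytes past PtrA.
  bool areConsecutive(ScalarEvolution &SE, Value *PtrA, Value *PtrB,
                      uint64_t ElemSize) {
    const SCEV *Diff = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
    if (const auto *C = dyn_cast<SCEVConstant>(Diff))
      return C->getAPInt() == ElemSize;
    return false;                    // distance unknown at compile time
  }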
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
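SetVector provides set semantics with a stable, first-insertion iteration order, which keeps worklist-driven transforms deterministic. A small illustrative sketch:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SetVector.h"
  using namespace llvm;

  void dedupInOrder(ArrayRef<int> Input) {
    SetVector<int> Seen;
    for (int V : Input)
      Seen.insert(V);                // returns false for duplicates
    for (int V : Seen.getArrayRef()) // first-insertion order, not hash order
      (void)V;
    Seen.clear();
  }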
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exactly one source vector.
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
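These static mask classifiers let cost modeling name a shuffle before any instruction exists. An illustrative sketch of three common queries:

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  void maskKindDemo() {
    int Identity[] = {0, 1, 2, 3};
    int Reverse[]  = {3, 2, 1, 0};
    int Extract[]  = {4, 5, 6, 7};   // upper half of an 8-wide source
    int Index = -1;
    bool IsId  = ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4);
    bool IsRev = ShuffleVectorInst::isReverseMask(Reverse, /*NumSrcElts=*/4);
    bool IsExt = ShuffleVectorInst::isExtractSubvectorMask(
        Extract, /*NumSrcElts=*/8, Index); // sets Index to 4
    (void)IsId; (void)IsRev; (void)IsExt;
  }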
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
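find_first/find_next support sparse iteration over the set bits without scanning every index. A sketch (illustrative only):

  #include "llvm/ADT/SmallBitVector.h"
  using namespace llvm;

  void bitWalkDemo() {
    SmallBitVector BV(8);            // 8 bits, all clear
    BV.set(1);
    BV.set(5);
    for (int I = BV.find_first(); I != -1; I = BV.find_next(I)) {
      // visits I == 1, then I == 5
    }
    bool Covered = BV.all();         // false: six bits remain clear
    (void)Covered;
  }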
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instances.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
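getScalarType is what lets one predicate serve scalars and vectors alike, a pattern used throughout the element-type checks. A two-line sketch (hypothetical helper):

  #include "llvm/IR/Type.h"
  using namespace llvm;

  // True for i8/i16/i32 and for vectors of those element types.
  bool isNarrowIntOrVec(Type *Ty) {
    return Ty->isIntOrIntVectorTy() &&
           Ty->getScalarType()->getIntegerBitWidth() <= 32;
  }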
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
void takeName(Value *V)
Transfer the name from V to this value.
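replaceAllUsesWith plus takeName is the idiomatic way to splice in a replacement value while keeping the IR readable. A sketch (illustrative; the helper is hypothetical):

  #include <cassert>
  #include "llvm/IR/Value.h"
  using namespace llvm;

  void replaceAndRename(Value *Old, Value *New) {
    if (Old == New)
      return;
    New->takeName(Old);              // keep the old name on the new value
    Old->replaceAllUsesWith(New);
    assert(Old->hasNUses(0) && "all uses should now point at New");
  }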
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, Instruction *VL0, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state in which the loads in the given sequence can be represented.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes.
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was already checked for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with a narrower bitwidth at codegen and, if so, returns its signedness.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars to be extracted, their lanes and their scalar users.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and extraction in the vectorized value.
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty order.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score, deemed to have the best chance to form the root of a profitable tree to vectorize.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibly) permutation with other gathers.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
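Read in order, these BoUpSLP members trace the pass's driver loop: build the tree, bail on tiny trees, reorder, analyze bitwidths, cost, then emit. A hedged, simplified sketch of that sequence (the real driver does more, e.g. bottom-to-top reordering and node transforms; Threshold is a hypothetical stand-in for the cost cutoff, and the snippet assumes the definitions in this file):

  // R is a fully constructed BoUpSLP; VL is one candidate bundle of scalars.
  static void tryVectorizeBundle(BoUpSLP &R, ArrayRef<Value *> VL,
                                 const SmallDenseSet<Value *> &Ignored,
                                 int Threshold) {
    R.buildTree(VL, Ignored);
    if (R.isTreeTinyAndNotFullyVectorizable())
      return;                         // not worth the shuffle overhead
    R.reorderTopToBottom();           // pick profitable operand orders
    R.buildExternalUses();
    R.computeMinimumValueSizes();     // min-bitwidth narrowing
    if (R.getTreeCost() < -Threshold) // negative cost means profit
      R.vectorizeTree();
  }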
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
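The m_* combinators compose into declarative pattern trees; match fills the captures only on success. A small illustrative sketch recognizing (X << C) | Y:

  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  bool matchShlOr(Value *V, Value *&X, Value *&Y, const APInt *&ShAmt) {
    // Binds X, Y and ShAmt only if the whole tree matches.
    return match(V, m_Or(m_Shl(m_Value(X), m_APInt(ShAmt)), m_Value(Y)));
  }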
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing the effect of MI in a DIExpression.
testing::Matcher< const detail::ErrorHolder & > Failed()
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
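The two mask factories above are easiest to understand by example; a sketch (illustrative only):

  #include "llvm/Analysis/VectorUtils.h"
  using namespace llvm;

  void maskFactoryDemo() {
    // Start 0, stride 2, VF 4: selects elements <0, 2, 4, 6>.
    SmallVector<int, 16> Strided = createStrideMask(0, 2, 4);
    // Replication factor 2, VF 4: <0, 0, 1, 1, 2, 2, 3, 3>.
    SmallVector<int, 16> Replicated = createReplicatedMask(2, 4);
    (void)Strided; (void)Replicated;
  }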
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if the widened type of Ty elements with size Sz represents a full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that is supposed to be vectorized.
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than the type of C.
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits functions.
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::pair and std::tuple) compares less than the first component of another container.
Function object to check whether the second component of a container supported by std::get (like std::pair and std::tuple) compares less than the second component of another container.
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.