#ifdef EXPENSIVE_CHECKS

using namespace slpvectorizer;
using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions,
          "Number of vector instructions generated");
115 "Controls which SLP graphs should be vectorized.");
119 cl::desc(
"Run the SLP vectorization passes"));
123 cl::desc(
"Enable vectorization for wider vector utilization"));
127 cl::desc(
"Only vectorize if you gain more than this "
132 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
133 "heuristics and makes vectorization decision via cost modeling."));
137 cl::desc(
"Attempt to vectorize horizontal reductions"));
142 "Attempt to vectorize horizontal reductions feeding into a store"));
146 cl::desc(
"Attempt to vectorize for this register size in bits"));
150 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
158 cl::desc(
"Limit the size of the SLP scheduling region per block"));
162 cl::desc(
"Attempt to vectorize for this register size in bits"));
166 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
170 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
176 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
185 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
189 cl::desc(
"The minimum number of loads, which should be considered strided, "
190 "if the stride is > 1 or is runtime value"));
194 cl::desc(
"The maximum stride, considered to be profitable."));
198 cl::desc(
"Display the SLP trees with Graphviz"));
202 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
                                              Type *Ty, unsigned Sz) {
  if (NumParts == 0 || NumParts >= Sz)
    return bit_ceil(Sz);

  if (NumParts == 0 || NumParts >= Sz)
    return bit_floor(Sz);
  // ...
  return (Sz / RegVF) * RegVF;
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                        : Mask[I] * VecTyNumElements + J;
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
    ArrayRef<Value *> Group = VL.slice(I, GroupSize);
    SmallBitVector ExpectedIndex(GroupSize);
    if (!all_of(Group, [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(V);
          // From the same source.
          if (SV->getOperand(0) != Src)
            return false;
          int Index;
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);
          return true;
        }))
      return 0;
    if (!ExpectedIndex.all())
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
                                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}

/// Returns the number of elements of the given type \p Ty, contained in the
/// given register \p Part.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
}
  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  auto *It = find_if(VL, IsaPred<Instruction>);
  // ...
  for (Value *V : VL) {
    if (isa<PoisonValue>(V))
      continue;
    auto *II = dyn_cast<Instruction>(V);
    if (!II)
      return false;
    if (BB != II->getParent())
      return false;
  }
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            all_of(BO->uses(),
                   [](const Use &U) {
                     // Commutative, if icmp eq/ne sub, 0.
                     CmpInst::Predicate Pred;
                     if (match(U.getUser(),
                               m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                         (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                       return true;
                     // Commutative, if abs(sub nsw, true) or abs(sub, false).
                     ConstantInt *Flag;
                     return match(U.getUser(),
                                  m_Intrinsic<Intrinsic::abs>(
                                      m_Specific(U.get()), m_ConstantInt(Flag))) &&
                            (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                             Flag->isOne());
                   })) ||
           (BO->getOpcode() == Instruction::FSub &&
            all_of(BO->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  int Index = Offset;
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }
  return std::nullopt;
}

/// \returns the inserting or extracting index of the given instruction.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;
  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  if (!IV)
    return std::nullopt;

  Type *CurrentType = IV->getType();
  unsigned Index = Offset;
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getElementIndex(II);
        if (!Idx) {
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // ...
      Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}
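// Worked sketch (assumption for illustration): for
//   %v = insertelement <4 x i32> poison, i32 %x, i32 2
// isUndefVector returns a bit vector with lanes {0, 1, 3} set (undef/poison)
// and lane 2 cleared, unless a UseMask says lane 2 is never read.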
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  unsigned Size =
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        if (!EI)
          return S;
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        if (!VTy)
          return S;
        return std::max(S, VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  bool HasNonUndefVec = any_of(VL, [](Value *V) {
    auto *EE = dyn_cast<ExtractElementInst>(V);
    if (!EE)
      return false;
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))
      return false;
    return isGuaranteedNotToBePoison(Vec);
  });
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extract from an undef or poison vector.
    if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
      continue;
    if (isa<UndefValue>(Vec)) {
      Mask[I] = I;
    } else {
      if (isa<UndefValue>(EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
      if (!Idx)
        return std::nullopt;
      // Undefined behavior if Idx is negative or >= Size.
      if (Idx->getValue().uge(Size))
        continue;
      unsigned IntIdx = Idx->getValue().getZExtValue();
      Mask[I] = IntIdx;
    }
    // At most two distinct source vectors are allowed for a shuffle.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index differs from the operation number, it is a
    // permutation.
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  (void)HasNonUndefVec;
  // ...
}
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
class InstructionsState {
  // ...

  Instruction *getMainOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return MainOp;
  }

  Instruction *getAltOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return AltOp;
  }

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp)
      : MainOp(MainOp), AltOp(AltOp) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
};
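// Illustrative sketch (not from this file): MainOp/AltOp model "alternate"
// bundles such as the classic addsub pattern, e.g.
//   c[0] = a[0] + b[0];   // MainOp: add
//   c[1] = a[1] - b[1];   // AltOp:  sub
// which vectorizes to one vector add, one vector sub, and a blending
// shufflevector that selects the right lane from each.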
  return (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI) ||
         getSameOpcode({BaseOp1, Op1}, TLI);
}

/// \returns true if a compare instruction \p CI has similar "look" and same
/// predicate as \p BaseCI, "as is" or "swapped" form.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  // ...
  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}
  // Make sure these are all Instructions.
  if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
    return InstructionsState::invalid();

  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return InstructionsState::invalid();

  Instruction *MainOp = cast<Instruction>(*It);
  unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
  if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();

  bool IsCastOp = isa<CastInst>(MainOp);
  bool IsBinOp = isa<BinaryOperator>(MainOp);
  bool IsCmpOp = isa<CmpInst>(MainOp);
  Instruction *AltOp = MainOp;
  unsigned Opcode = MainOp->getOpcode();
  unsigned AltOpcode = Opcode;

  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
    CmpInst::Predicate BasePred = cast<CmpInst>(MainOp)->getPredicate();
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      if (!I)
        return false;
      CmpInst::Predicate CurrentPred = I->getPredicate();
      CmpInst::Predicate SwappedCurrentPred =
          CmpInst::getSwappedPredicate(CurrentPred);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    // Total number of predicates > 2, but if considering swapped predicates
    // compatible only 2, treat swappable predicates as compatible opcodes,
    // not as alternates.
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();

  // Check for one alternate opcode from another BinaryOperator.
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState::invalid();
  }
  bool AnyPoison = InstCnt != VL.size();
  for (Value *V : iterator_range(It + 1, VL.end())) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;

    // Cannot combine poison and divisions.
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltOp = I;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = MainOp->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltOp = I;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(MainOp);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        CmpInst::Predicate BasePred = BaseInst->getPredicate();
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(AltOp);
        if (MainOp != AltOp) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltOp = I;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
             "CastInst.");
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
          return InstructionsState::invalid();
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState::invalid();
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(MainOp);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(MainOp);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            (!CallBase->hasOperandBundles() ||
             !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                         Call->op_begin() + Call->getBundleOperandsEndIndex(),
                         CallBase->op_begin() +
                             CallBase->getBundleOperandsStartIndex())))
          return InstructionsState::invalid();
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState::invalid();
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState::invalid();
        }
      }
      continue;
    }
    return InstructionsState::invalid();
  }

  return InstructionsState(MainOp, AltOp);
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    return any_of(enumerate(CI->args()), [&](auto &&Arg) {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;
    });
  }
  default:
    return false;
  }
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  // ...

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // Check if input scalars were extended to match the size of other node.
       (SubMask.size() == Mask.size() && Mask.empty())) &&
      "SubMask with many inputs support must be larger than the mask.");
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}
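// Worked example (illustrative): masks compose as NewMask[I] = Mask[SubMask[I]].
// With Mask = {3, 2, 0, 1} and SubMask = {1, 0, 3, 2}:
//   NewMask = {Mask[1], Mask[0], Mask[3], Mask[2]} = {2, 3, 1, 0}.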
  const unsigned Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
  Type *ScalarTy = VL[0]->getType();
  unsigned ScalarTyNumElements = getNumElements(ScalarTy);
  SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
  for (unsigned Lane : seq<unsigned>(VL.size())) {
    if (isa<PoisonValue>(VL[Lane]))
      continue;
    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
  }
  const unsigned E = Indices.size();
  Mask.clear();
  Mask.resize(E, PoisonMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
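// Worked example (illustrative): for Indices = {2, 0, 1} the loop sets
// Mask[2] = 0, Mask[0] = 1, Mask[1] = 2, i.e. Mask = {1, 2, 0} -- the
// inverse permutation of Indices.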
  assert(!Mask.empty() && "Expected non-empty mask.");
  SmallVector<Value *> Prev(Scalars.begin(), Scalars.end());
  Prev.swap(Scalars);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Scalars[Mask[I]] = Prev[I];
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  return !mayHaveNonDefUseDependency(*I) &&
         all_of(I->operands(), [I](Value *V) {
           auto *IO = dyn_cast<Instruction>(V);
           if (!IO)
             return true;
           return isa<PHINode>(IO) || IO->getParent() != I->getParent();
         });

  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  // Limits the number of uses to save compile time.
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         all_of(I->users(), [I](User *U) {
           auto *IU = dyn_cast<Instruction>(U);
           if (!IU)
             return true;
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
         });

  return !VL.empty() &&
         (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));

  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
         Sz % NumParts == 0;
namespace slpvectorizer {

  struct ScheduleData;
  // ...

      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
        // ...

  /// Returns whether the root node has in-tree uses.
  bool doesRootHaveInTreeUses() const {
    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();
  }
  /// Return the scalars of the root node.
  ArrayRef<Value *> getRootNodeScalars() const {
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
  }

  /// Returns the type/is-signed info for the root node in the graph without
  /// casting.
  std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
    const TreeEntry &Root = *VectorizableTree.front().get();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
      return std::make_pair(
          IntegerType::get(Root.Scalars.front()->getContext(),
                           It->second.first),
          It->second.second);
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;
  }

  /// Checks if the root graph node can be emitted with narrower bitwidth at
  /// codegen and returns its signedness, if so.
  bool isSignedMinBitwidthRootNode() const {
    return MinBWs.at(VectorizableTree.front().get()).second;
  }

  /// Returns the reduction type after minbitwidth analysis.
  FixedVectorType *getReductionType() const {
    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      return getWidenedType(
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
    return getWidenedType(
        IntegerType::get(
            VectorizableTree.front()->Scalars.front()->getContext(),
            ReductionBitWidth),
        VectorizableTree.front()->getVectorFactor());
  }
  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MultiNodeScalars.clear();
    // ...
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    // ...
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
  }
  /// Checks if the given order is an identity order (elements equal to Sz are
  /// treated as masked).
  assert(!Order.empty() && "expected non-empty order");
  const unsigned Sz = Order.size();
  return all_of(enumerate(Order), [&](const auto &P) {
    return P.value() == P.index() || P.value() == Sz;
  });

  unsigned getMaxVecRegSize() const {
    return MaxVecRegSize;
  }

  unsigned getMinVecRegSize() const {
    return MinVecRegSize;
  }

  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
    unsigned MaxVF = MaxVFOption.getNumOccurrences()
                         ? MaxVFOption
                         : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
  }

  // ... (canVectorizeLoads declaration, trailing parameters:)
  //                          unsigned *BestVF = nullptr,
  //                          bool TryRecursiveCheck = true) const;
  template <typename T>
  // ...

  /// Debug print of an edge in the SLP graph.
  void dump(raw_ostream &OS) const {
    OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
       << " EdgeIdx:" << EdgeIdx << "}";
  }

  LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
                      ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
                      int MaxLevel)
      : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
        MaxLevel(MaxLevel) {}
    if (isa<LoadInst>(V1)) {
      // A broadcast of a load can be cheaper on some targets.
      auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
        // Check if all users are internal (part of the tree) or the two roots.
        auto AllUsersVectorized = [U1, U2, this](Value *V) {
          return all_of(V->users(), [U1, U2, this](Value *U) {
            return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
          });
        };
        return AllUsersVectorized(V1) && AllUsersVectorized(V2);
      };
      if (R.TTI->isLegalBroadcastLoad(V1->getType(),
                                      ElementCount::getFixed(NumLanes)) &&
          ((int)V1->getNumUses() == NumLanes ||
           AllUsersAreInternal(V1, V2)))
        return LookAheadHeuristics::ScoreSplatLoads;
    }

    auto CheckSameEntryOrFail = [&]() {
      if (const TreeEntry *TE1 = R.getTreeEntry(V1);
          TE1 && TE1 == R.getTreeEntry(V2))
        return LookAheadHeuristics::ScoreSplatLoads;
      return LookAheadHeuristics::ScoreFail;
    };

    auto *LI1 = dyn_cast<LoadInst>(V1);
    auto *LI2 = dyn_cast<LoadInst>(V2);
    if (LI1 && LI2) {
      if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
          !LI2->isSimple())
        return CheckSameEntryOrFail();

      std::optional<int> Dist = getPointersDiff(
          LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
          LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
      if (!Dist || *Dist == 0) {
        if (getUnderlyingObject(LI1->getPointerOperand()) ==
                getUnderlyingObject(LI2->getPointerOperand()) &&
            R.TTI->isLegalMaskedGather(
                getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
          return LookAheadHeuristics::ScoreMaskedGatherCandidate;
        return CheckSameEntryOrFail();
      }
      // The distance is too large - still may be profitable to use masked
      // loads/gathers.
      if (std::abs(*Dist) > NumLanes / 2)
        return LookAheadHeuristics::ScoreMaskedGatherCandidate;
      // ...
    }

    auto *C1 = dyn_cast<Constant>(V1);
    auto *C2 = dyn_cast<Constant>(V2);
    if (C1 && C2)
      return LookAheadHeuristics::ScoreConstants;

    // Extracts from consecutive indices of the same vector are cheap.
    if (isa<UndefValue>(V2))
      return LookAheadHeuristics::ScoreConsecutiveExtracts;
    Value *EV2 = nullptr;
    // ...
    int Dist = Idx2 - Idx1;
    // The distance is too large - still may be profitable to use shuffles.
    if (std::abs(Dist) == 0)
      return LookAheadHeuristics::ScoreSplat;
    if (std::abs(Dist) > NumLanes / 2)
      return LookAheadHeuristics::ScoreSameOpcode;
    // ...
    return CheckSameEntryOrFail();

    auto *I1 = dyn_cast<Instruction>(V1);
    auto *I2 = dyn_cast<Instruction>(V2);
    if (I1 && I2) {
      if (I1->getParent() != I2->getParent())
        return CheckSameEntryOrFail();
      SmallVector<Value *, 4> Ops(MainAltOps);
      Ops.push_back(I1);
      Ops.push_back(I2);
      InstructionsState S = getSameOpcode(Ops, TLI);
      // Note: Only consider instructions with <= 2 operands to avoid
      // complexity explosion.
      if (S &&
          (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
           !S.isAltShuffle()) &&
          all_of(Ops, [&S](Value *V) {
            return isa<PoisonValue>(V) ||
                   cast<Instruction>(V)->getNumOperands() ==
                       S.getMainOp()->getNumOperands();
          }))
        return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
                                : LookAheadHeuristics::ScoreSameOpcode;
    }

    if (I1 && isa<PoisonValue>(V2))
      return LookAheadHeuristics::ScoreSameOpcode;

    if (isa<UndefValue>(V2))
      return LookAheadHeuristics::ScoreUndef;

    return CheckSameEntryOrFail();
    int ShallowScoreAtThisLevel =
        getShallowScore(LHS, RHS, U1, U2, MainAltOps);

    // If reached MaxLevel,
    //  or if V1 and V2 are not instructions,
    //  or if they are SPLAT,
    //  or if they are not consecutive,
    //  or if profitable to vectorize loads or extractelements, early return
    //  the current cost.
    auto *I1 = dyn_cast<Instruction>(LHS);
    auto *I2 = dyn_cast<Instruction>(RHS);
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
        (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
          (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
          (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
         ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");

    // Keep track of the operand index of I2 that we already matched.
    SmallSet<unsigned, 4> Op2Used;

    // Recursion towards the operands of I1 and I2. We are trying all possible
    // operand pairs, and keeping track of the best score.
    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      // Try to pair op1I with the best operand of I2.
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
      // If I2 is commutative try all combinations.
      unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
      unsigned ToIdx = isCommutative(I2)
                           ? I2->getNumOperands()
                           : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        // Skip operands already paired with OpIdx1.
        if (Op2Used.count(OpIdx2))
          continue;
        // Recursively calculate the cost at each level.
        int TmpScore =
            getScoreAtLevelOf(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
                              I1, I2, CurrLevel + 1, {});
        // Look for the best score.
        if (TmpScore > LookAheadHeuristics::ScoreFail &&
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
          MaxOpIdx2 = OpIdx2;
          FoundBest = true;
        }
      }
      if (FoundBest) {
        // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
      }
    }
    return ShallowScoreAtThisLevel;
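// Illustrative note (assumption): the look-ahead score compares candidate
// operand pairs one level deeper. E.g. when reordering the operands of
//   t1 = a[0] + b[0]
//   t2 = b[1] + a[1]
// pairing (a[0], a[1]) and (b[0], b[1]) scores higher (consecutive loads)
// than the in-place pairing, so the second lane's operands get swapped.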
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      /// The operand value.
      Value *V = nullptr;
      /// Is it an "Alternate Path Operand" (see the class comment)?
      bool APO = false;
      /// Helper data for the reordering function.
      bool IsUsed = false;
    };

    /// The modes a reordering strategy can be in.
    enum class ReorderingMode {
      Load,
      Opcode,
      Constant,
      Splat,
      Failed,
    };

    unsigned ArgSize = 0;

    const Loop *L = nullptr;

    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }
    /// Clears the used flag of all entries.
    void clearUsed() {
      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
           OpIdx != NumOperands; ++OpIdx)
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
             ++Lane)
          OpsVec[OpIdx][Lane].IsUsed = false;
    }

    /// Swap the operands at OpIdx1 and OpIdx2 in the given lane.
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    }
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
                      const SmallBitVector &UsedLanes) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
          isa<ExtractElementInst>(IdxLaneV))
        return 0;
      SmallDenseMap<Value *, unsigned, 4> Uniques;
      for (unsigned Ln : seq<unsigned>(getNumLanes())) {
        if (Ln == Lane)
          continue;
        Value *OpIdxLnV = getData(OpIdx, Ln).V;
        if (!isa<Instruction>(OpIdxLnV))
          return 0;
        Uniques.try_emplace(OpIdxLnV, Ln);
      }
      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      auto OpIdxIt = Uniques.find(OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        return 0;
      return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                          UniquesCntWithOpIdxLaneV,
                      UniquesCntWithOpIdxLaneV -
                          bit_floor(UniquesCntWithOpIdxLaneV)) -
             ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                  ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                  : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    }
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      // ...
      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
        return 0;
      return R.areAllUsersVectorized(IdxLaneI)
                 ? LookAheadHeuristics::ScoreAllUserVectorized
                 : 0;
    }

    /// Score scaling factor for fully compatible instructions but with
    /// different number of external uses.
    static const int ScoreScaleFactor = 10;

    /// \Returns the look-ahead score for the given operand pair.
    int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
                          int Lane, unsigned OpIdx, unsigned Idx,
                          bool &IsUsed, const SmallBitVector &UsedLanes) {
      // ...
      int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
      if (Score <= -SplatScore) {
        // Failed score.
        Score = 0;
      } else {
        Score += SplatScore;
        // Scale score to see the difference between different operands
        // and similar operands but all vectorized/not all vectorized
        // uses.
        Score *= ScoreScaleFactor;
        Score += getExternalUseScore(Lane, OpIdx, Idx);
      }
      return Score;
    }
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                   ArrayRef<ReorderingMode> ReorderingModes,
                   ArrayRef<Value *> MainAltOps,
                   const SmallBitVector &UsedLanes) {
      unsigned NumOperands = getNumOperands();

      // The operand of the previous lane at OpIdx.
      Value *OpLastLane = getData(OpIdx, LastLane).V;

      // Our strategy mode for OpIdx.
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;

      // The linearized opcode of the operand at OpIdx, Lane.
      bool OpIdxAPO = getData(OpIdx, Lane).APO;

      // The best operand index and its score.
      struct BestOpData {
        std::optional<unsigned> Idx;
        unsigned Score = 0;
      } BestOp;
      BestOp.Score =
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
              .first->second;

      // Track if the operand must be marked as used.
      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;
      // Iterate through all unused operands and look for the best.
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        // Get the operand at Idx and Lane.
        OperandData &OpData = getData(Idx, Lane);
        Value *Op = OpData.V;
        bool OpAPO = OpData.APO;

        // Skip already selected operands.
        if (OpData.IsUsed)
          continue;

        // Skip if we are trying to move the operand to a position with a
        // different opcode in the linearized tree form. This would break the
        // semantics.
        if (OpAPO != OpIdxAPO)
          continue;

        // Look for an operand that matches the current mode.
        switch (RMode) {
        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed, UsedLanes);
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
               Idx == OpIdx)) {
            BestOp.Idx = Idx;
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
          }
          break;
        }
        case ReorderingMode::Constant:
          if (isa<Constant>(Op) ||
              (!BestOp.Score && L && L->isLoopInvariant(Op))) {
            BestOp.Idx = Idx;
            if (isa<Constant>(Op)) {
              BestOp.Score = LookAheadHeuristics::ScoreConstants;
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                  LookAheadHeuristics::ScoreConstants;
            }
            if (isa<UndefValue>(Op) || !isa<Constant>(Op))
              IsUsed = false;
          }
          break;
        case ReorderingMode::Splat:
          if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
            IsUsed = Op == OpLastLane;
            if (Op == OpLastLane) {
              BestOp.Score = LookAheadHeuristics::ScoreSplat;
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                  LookAheadHeuristics::ScoreSplat;
            }
            BestOp.Idx = Idx;
          }
          break;
        case ReorderingMode::Failed:
          llvm_unreachable("Not expected Failed reordering mode.");
        }
      }

      if (BestOp.Idx) {
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
        return BestOp.Idx;
      }
      // If we could not find a good match, return std::nullopt.
      return std::nullopt;
    }
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      // std::pair<unsigned, unsigned> is used to implement a simple voting
      // algorithm and choose the lane with the least number of operands that
      // can freely move about or are less profitable because they already have
      // the most optimal set of operands.
      SmallDenseMap<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        // Compare the number of operands that can move and choose the one with
        // the least number.
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap.clear();
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          // Select the most optimal lane in terms of number of operands that
          // should be moved around.
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto [It, Inserted] =
              HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
          if (!Inserted)
            ++It->second.first;
        }
      }
      // Select the lane with the minimum counter.
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
      for (const auto &Data : reverse(HashMap)) {
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
      }
      return BestLane;
    }
    /// Data structure that helps to reorder operands.
    struct OperandsOrderData {
      /// The best number of operands with the same APOs, which can be
      /// reordered.
      unsigned NumOfAPOs = UINT_MAX;
      /// Number of operands with the same/alternate instruction opcode and
      /// parent.
      unsigned NumOpsWithSameOpcodeParent = 0;
      /// Hash of the actual operands ordering.
      unsigned Hash = 0;
    };

    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      // Operands with the same APO can be reordered.
      // ...
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
      Instruction *OpcodeI = nullptr;
      BasicBlock *Parent = nullptr;
      unsigned Hash = 0;
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        const OperandData &OpData = getData(OpIdx, Lane);
        if (OpData.APO)
          ++CntTrue;
        // Use Boyer-Moore majority voting for finding the majority opcode and
        // the number of operands with this opcode.
        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
          if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
              I->getParent() != Parent) {
            if (NumOpsWithSameOpcodeParent == 0) {
              NumOpsWithSameOpcodeParent = 1;
              OpcodeI = I;
              Parent = I->getParent();
            } else {
              --NumOpsWithSameOpcodeParent;
            }
          } else {
            ++NumOpsWithSameOpcodeParent;
          }
        }
        Hash = hash_combine(
            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
      }
      if (AllUndefs)
        return {};
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      Data.Hash = Hash;
      return Data;
    }
    /// Go through the instructions in VL and append their operands.
    void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
      assert((empty() || VL.size() == getNumLanes()) &&
             "Expected same number of lanes");
      constexpr unsigned IntrinsicNumOperands = 2;
      unsigned NumOperands = VL0->getNumOperands();
      ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
      OpsVec.resize(NumOperands);
      unsigned NumLanes = VL.size();
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        OpsVec[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
                 "Expected instruction or poison value");
          // ...
          if (isa<PoisonValue>(VL[Lane])) {
            if (auto *EI = dyn_cast<ExtractElementInst>(VL0)) {
              if (OpIdx == 0) {
                OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false};
                continue;
              }
            } else if (auto *EV = dyn_cast<ExtractValueInst>(VL0)) {
              if (OpIdx == 0) {
                OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false};
                continue;
              }
            }
            OpsVec[OpIdx][Lane] = {
                PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
                false};
            continue;
          }
          bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
                                 APO, false};
        }
      }
    }
    /// \returns the number of operands.
    unsigned getNumOperands() const { return ArgSize; }

    /// \returns the number of lanes.
    unsigned getNumLanes() const { return OpsVec[0].size(); }

    /// \returns the operand value at \p OpIdx and \p Lane.
    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    }

    /// \returns true if the data structure is empty.
    bool empty() const { return OpsVec.empty(); }

    /// Clears the data.
    void clear() { OpsVec.clear(); }
    /// \Returns true if the value should be broadcast (splatted) rather than
    /// reordered into a vectorizable sequence.
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      // Small number of loads - try load matching.
      if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
        return false;
      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(Op);
      unsigned Cnt = 0;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        // This is set to true if we found a candidate for broadcast at Lane.
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
            continue;
          Value *OpILane = getValue(OpI, Lane);
          bool IsConstantOp = isa<Constant>(OpILane);
          // Consider the broadcast candidate if:
          // 1. Same value is found in one of the operands.
          if (Data.V == Op ||
              // 2. The operand in the given lane is not constant but there is
              // a constant operand in another lane (which can be moved to the
              // given lane). In this case we can represent it as a simple
              // permutation of constant and broadcast.
              (!IsConstantOp &&
               ((Lns > 2 && isa<Constant>(Data.V)) ||
                (Lns == 2 &&
                 isa<Constant>(Data.V)))) ||
              // 3. The operand in the current lane is loop invariant (can be
              // hoisted out) and another operand is also loop invariant
              // (though not a constant). In this case the whole vector can be
              // hoisted out.
              (IsInvariant && !isa<Constant>(Data.V) &&
               L->isLoopInvariant(Data.V))) {
            FoundCandidate = true;
            Data.IsUsed = Data.V == Op;
            if (Data.V == Op)
              ++Cnt;
            break;
          }
        }
        if (!FoundCandidate)
          return false;
      }
      return getNumLanes() == 2 || Cnt > 1;
    }
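    // Illustrative note (assumption): shouldBroadcast() decides whether the
    // first-lane operand should be splatted across lanes. E.g. for
    //   x[0] = a * b[0];  x[1] = a * b[1];
    // the operand 'a' repeats in every lane, so the Splat reordering mode is
    // chosen and codegen can emit a single broadcast of 'a' plus one vector
    // multiply.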
    /// Checks if there is at least one compatible operand in the other lanes.
    bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      bool OpAPO = getData(OpIdx, Lane).APO;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
              const OperandData &Data = getData(OpI, Ln);
              if (Data.APO != OpAPO || Data.IsUsed)
                return false;
              Value *OpILn = getValue(OpI, Ln);
              return (L && L->isLoopInvariant(OpILn)) ||
                     (getSameOpcode({Op, OpILn}, TLI) &&
                      allSameBlock({Op, OpILn}));
            }))
          continue;
        return false;
      }
      return true;
    }
    VLOperands(ArrayRef<Value *> RootVL, Instruction *VL0, const BoUpSLP &R)
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
          L(R.LI->getLoopFor((VL0->getParent()))) {
      // Append all the operands of RootVL.
      appendOperandsOfVL(RootVL, VL0);
    }

    /// \Returns a value vector with the operands across all lanes for the
    /// operand at \p OpIdx.
    ValueList getVL(unsigned OpIdx) const {
      ValueList OpVL(OpsVec[OpIdx].size());
      assert(OpsVec[OpIdx].size() == getNumLanes() &&
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;
      return OpVL;
    }
    void reorder() {
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      // ...
      unsigned FirstLane = getBestLaneToStartReordering();

      // Initialize the modes.
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);
        // Keep track if we have instructions with all the same opcode on one
        // side.
        if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
          // Check if OpLane0 should be broadcast.
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
              !canBeVectorized(OpILane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else if (isa<LoadInst>(OpILane0))
            ReorderingModes[OpIdx] = ReorderingMode::Load;
          else
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(OpLane0)) {
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        } else if (isa<Argument>(OpLane0)) {
          // Our best hope is a Splat. It may save some cost in some cases.
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        }
      }

      // Check that we don't have the same operands. No need to reorder if
      // operands are just a perfect diamond or shuffled diamond match.
      auto &&SkipReordering = [this]() {
        SmallPtrSet<Value *, 4> UniqueValues;
        ArrayRef<OperandData> Op0 = OpsVec.front();
        for (const OperandData &Data : Op0)
          UniqueValues.insert(Data.V);
        for (ArrayRef<OperandData> Op :
             ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
                return !UniqueValues.contains(Data.V);
              }))
            return false;
        }
        return true;
      };
      if (SkipReordering())
        return;

      bool StrategyFailed = false;
      // Mark all operand data as free to use.
      clearUsed();
      SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
      for (unsigned I = 0; I < NumOperands; ++I)
        MainAltOps[I].push_back(getData(I, FirstLane).V);

      SmallBitVector UsedLanes(NumLanes);
      UsedLanes.set(FirstLane);
      for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
        // Visit the lane on the right and then the lane on the left.
        for (int Direction : {+1, -1}) {
          int Lane = FirstLane + Direction * Distance;
          if (Lane < 0 || Lane >= (int)NumLanes)
            continue;
          UsedLanes.set(Lane);
          int LastLane = Lane - Direction;
          assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                 "Out of bounds");
          // Look for a good match for each operand.
          for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
            // Search for the operand that matches SortedOps[OpIdx][Lane-1].
            std::optional<unsigned> BestIdx =
                getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                               MainAltOps[OpIdx], UsedLanes);
            // By not selecting a value, we allow the operands that follow to
            // select a better matching value.
            if (BestIdx) {
              // Swap the current operand with the one returned by
              // getBestOperand().
              swap(OpIdx, *BestIdx, Lane);
            } else {
              // Enable the second pass.
              StrategyFailed = true;
            }
            // Try to get the alternate opcode and follow it during analysis.
            if (MainAltOps[OpIdx].size() != 2) {
              OperandData &AltOp = getData(OpIdx, Lane);
              InstructionsState OpS =
                  getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
              if (OpS && OpS.isAltShuffle())
                MainAltOps[OpIdx].push_back(AltOp.V);
            }
          }
        }
      }
      // Skip the second pass if the strategy did not fail.
      if (!StrategyFailed)
        return;
      // ...
    }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
      switch (RMode) {
      case ReorderingMode::Load:
        return "Load";
      case ReorderingMode::Opcode:
        return "Opcode";
      case ReorderingMode::Constant:
        return "Constant";
      case ReorderingMode::Splat:
        return "Splat";
      case ReorderingMode::Failed:
        return "Failed";
      }
      llvm_unreachable("Unimplemented Reordering Type");
    }

    /// Debug print.
    LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
      const unsigned Indent = 2;
      unsigned Cnt = 0;
      for (const OperandDataVec &OpDataVec : OpsVec) {
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          OS.indent(Indent) << "{";
          if (Value *V = OpData.V)
            OS << *V;
          else
            OS << "null";
          OS << ", APO:" << OpData.APO << "}\n";
        }
      }
      return OS;
    }
#endif
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
      int Score = LookAhead.getScoreAtLevelOf(Candidates[I].first,
                                              Candidates[I].second,
                                              /*U1=*/nullptr, /*U2=*/nullptr,
                                              /*CurrLevel=*/1, {});
      if (Score > BestScore) {
        BestScore = Score;
        Index = I;
      }
    }
    return Index;
    DeletedInstructions.insert(I);
  }

  /// Remove instructions from the parent function and clear the operands of
  /// \p DeadVals instructions, marking trivially dead operands for deletion.
  template <typename T>
  void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
    SmallVector<WeakTrackingVH> DeadInsts;
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      DeletedInstructions.insert(I);
    }
    DenseSet<Value *> Processed;
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
        continue;
      auto *I = cast<Instruction>(V);
      SmallVector<const TreeEntry *> Entries;
      if (const TreeEntry *Entry = getTreeEntry(I)) {
        Entries.push_back(Entry);
        auto It = MultiNodeScalars.find(I);
        if (It != MultiNodeScalars.end())
          Entries.append(It->second.begin(), It->second.end());
      }
      for (Use &U : I->operands()) {
        if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            wouldInstructionBeTriviallyDead(OpI, TLI) &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
             })))
          DeadInsts.push_back(OpI);
      }
      I->dropAllReferences();
    }
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      if (!I->getParent())
        continue;
      assert((I->use_empty() ||
              all_of(I->uses(),
                     [&](Use &U) {
                       return isDeleted(cast<Instruction>(U.getUser()));
                     })) &&
             "trying to erase instruction with users.");
      I->removeFromParent();
    }
    // Process the dead instruction list until empty.
    while (!DeadInsts.empty()) {
      Value *V = DeadInsts.pop_back_val();
      Instruction *VI = cast_or_null<Instruction>(V);
      if (!VI || !VI->getParent())
        continue;
      assert(isInstructionTriviallyDead(VI, TLI) &&
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");

      // Null out all of the instruction's operands to see if any operand
      // becomes dead as we go.
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (!OpV)
          continue;
        OpU.set(nullptr);

        if (!OpV->use_empty())
          continue;

        // If the operand is an instruction that became dead as we nulled out
        // the operand, and if it is 'trivially' dead, delete it in a future
        // loop iteration.
        if (auto *OpI = dyn_cast<Instruction>(OpV))
          if (!DeletedInstructions.contains(OpI) &&
              isInstructionTriviallyDead(OpI, TLI))
            DeadInsts.push_back(OpI);
      }

      VI->removeFromParent();
      DeletedInstructions.insert(VI);
    }
  }

  /// Checks if the instruction was already analyzed as a possible reduction
  /// root.
  bool isAnalyzedReductionRoot(Instruction *I) const {
    return AnalyzedReductionsRoots.count(I);
  }
  /// Register the given instruction as already analyzed as a possible
  /// reduction root.
  void analyzedReductionRoot(Instruction *I) {
    AnalyzedReductionsRoots.insert(I);
  }
  // ...
  /// Clear the list of the analyzed reduction root instructions.
  void clearReductionData() {
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
  }
  // ...
  bool doesNotNeedToSchedule(Value *V) const {
    return NonScheduledFirst.contains(V);
  }
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      /*...*/
      bool &IsProfitableToDemote, bool IsTruncRoot) const;

  /// Check if the operands on the edges \p Edges of the \p UserTE allow
  /// reordering.
  bool canReorderOperands(TreeEntry *UserTE, /*...*/) const;

  /// Reorders the node with reuses.
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
    TreeEntry *TE = nullptr;
    const auto *It = find_if(VL, [&](Value *V) {
      TE = getTreeEntry(V);
      if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
        return true;
      auto It = MultiNodeScalars.find(V);
      if (It != MultiNodeScalars.end()) {
        for (TreeEntry *E : It->second) {
          if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
            TE = E;
            return true;
          }
        }
      }
      return false;
    });
    if (It != VL.end()) {
      assert(TE->isSame(VL) && "Expected same scalars.");
      return TE;
    }
    return nullptr;
  }

  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);
  }
3019 const TreeEntry *getOperandEntry(
const TreeEntry *E,
unsigned Idx)
const;
3024 Instruction *getRootEntryInstruction(
const TreeEntry &Entry)
const;
3028 getCastContextHint(
const TreeEntry &TE)
const;
3037 const EdgeInfo &EI,
unsigned InterleaveFactor = 0);
3048 bool ResizeAllowed =
false)
const;
3057 TreeEntry *getMatchedVectorizedOperand(
const TreeEntry *E,
unsigned NodeIdx);
3058 const TreeEntry *getMatchedVectorizedOperand(
const TreeEntry *E,
3059 unsigned NodeIdx)
const {
3060 return const_cast<BoUpSLP *
>(
this)->getMatchedVectorizedOperand(E, NodeIdx);
3067 Value *vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
bool PostponedPHIs);
3072 template <
typename BVTy,
typename ResTy,
typename...
Args>
3073 ResTy processBuildVector(
const TreeEntry *E,
Type *ScalarTy, Args &...Params);
3078 Value *createBuildVector(
const TreeEntry *E,
Type *ScalarTy,
3079 bool PostponedPHIs);
3085 Instruction &getLastInstructionInBundle(
const TreeEntry *E);
3092 std::optional<TargetTransformInfo::ShuffleKind>
3104 unsigned NumParts)
const;
3116 std::optional<TargetTransformInfo::ShuffleKind>
3117 isGatherShuffledSingleRegisterEntry(
3134 isGatherShuffledEntry(
3137 unsigned NumParts,
bool ForOrder =
false);
3143 Type *ScalarTy)
const;
3147 void setInsertPointAfterBundle(
const TreeEntry *E);
3157 bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
3162 void tryToVectorizeGatheredLoads(
3171 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
3187 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
3191 void reorderGatherNode(TreeEntry &TE);
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
        return VL.size() == Mask.size() &&
               std::equal(VL.begin(), VL.end(), Mask.begin(),
                          [Scalars](Value *V, int Idx) {
                            return (isa<UndefValue>(V) &&
                                    Idx == PoisonMaskElem) ||
                                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
                          });
      };
      if (!ReorderIndices.empty()) {
        SmallVector<int> Mask;
        inversePermutation(ReorderIndices, Mask);
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          ::addMask(Mask, ReuseShuffleIndices);
          return IsSame(Scalars, Mask);
        }
        return false;
      }
      return IsSame(Scalars, ReuseShuffleIndices);
    }

    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return isGather() && !UserTreeIndices.empty() &&
             UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;
    }

    /// \returns true if current entry has same operands as \p TE.
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        return false;
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (Used.test(K))
            continue;
          if (getOperand(K) == TE.getOperand(I)) {
            Used.set(K);
            break;
          }
        }
        // Check if we actually found the matching operand.
        if (PrevCount == Used.count())
          return false;
      }
      return true;
    }

    /// \return Final vectorization factor for the node. Defined by the total
    /// number of vectorized scalars, including those, used several times in
    /// the entry and counted in the \a ReuseShuffleIndices, if any.
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    }

    /// Checks if the current node is a gather node.
    bool isGather() const { return State == NeedToGather; }
    enum CombinedOpcode {
      NotCombinedOp = -1,
      MinMax = Instruction::OtherOpsEnd + 1,
    };
    CombinedOpcode CombinedOp = NotCombinedOp;

    // ...
    VecTreeTy &Container;

    /// Interleaving factor for interleaved loads Vectorize nodes.
    unsigned InterleaveFactor = 0;

  public:
    /// Returns interleave factor for interleave nodes.
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    /// Sets interleaving factor for the interleaving nodes.
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
    /// Set this bundle's \p OpIdx'th operand to \p OpVL.
    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
      assert(Operands[OpIdx].empty() && "Already resized?");
      assert(OpVL.size() <= Scalars.size() &&
             "Number of operands is greater than the number of scalars.");
      Operands[OpIdx].assign(OpVL.begin(), OpVL.end());
    }

    /// Set this bundle's operands from \p Scalars.
    void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
      VLOperands Ops(Scalars, MainOp, R);
      if (RequireReorder)
        Ops.reorder();
      for (unsigned I : seq<unsigned>(MainOp->getNumOperands()))
        setOperand(I, Ops.getVL(I));
    }

    // ...

    /// \returns the number of operands.
    unsigned getNumOperands() const { return Operands.size(); }

    /// \returns the \p OpIdx operand of this TreeEntry.
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      assert(!Operands[OpIdx].empty() && "No operand available");
      return Operands[OpIdx][0];
    }

    /// Some of the instructions in the list have alternate opcodes.
    bool isAltShuffle() const { return MainOp != AltOp; }

    bool isOpcodeOrAlt(Instruction *I) const {
      unsigned CheckedOpcode = I->getOpcode();
      return (getOpcode() == CheckedOpcode ||
              getAltOpcode() == CheckedOpcode);
    }

    /// Chooses the correct key for scheduling data. If \p Op has the same (or
    /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
    /// MainOp.
    Value *isOneOf(Value *Op) const {
      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))
        return Op;
      return MainOp;
    }

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
      MainOp = S.getMainOp();
      AltOp = S.getAltOp();
    }

    // ...

    /// The main/alternate opcodes for the list of instructions.
    unsigned getOpcode() const {
      return MainOp ? MainOp->getOpcode() : 0;
    }

    unsigned getAltOpcode() const {
      return AltOp ? AltOp->getOpcode() : 0;
    }

    /// When ReuseShuffleIndices is empty it just returns the position of \p V
    /// within the vector of Scalars. Otherwise, try to remap on its reuse
    /// index.
    int findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        if (*It != V)
          continue;
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
          break;
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
          break;
        }
      }
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
      return FoundLane;
    }

    /// Returns true if the scalars form a non-power-of-2 vector.
    bool isNonPowOf2Vec() const {
      bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
      return IsNonPowerOf2;
    }

    // ...
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    /// Gets the scalar at the given \p Idx position, taking into account the
    /// reorder indices (used only for buildvector/gather nodes).
    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      // ...
    }

#ifndef NDEBUG
    /// Debug printer.
    LLVM_DUMP_METHOD void dump() const {
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          dbgs().indent(2) << *V << "\n";
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        dbgs().indent(2) << *V << "\n";
      dbgs() << "State: ";
      switch (State) {
      case Vectorize:
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
                 << "\n";
        } else {
          dbgs() << "Vectorize\n";
        }
        break;
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        break;
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        break;
      case NeedToGather:
        dbgs() << "NeedToGather\n";
        break;
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
        break;
      }
      dbgs() << "MainOp: ";
      if (MainOp)
        dbgs() << *MainOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "AltOp: ";
      if (AltOp)
        dbgs() << *AltOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "\n";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";
      dbgs() << "\n";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
        interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
        });
        dbgs() << "\n";
      }
    }
#endif
    void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
                       InstructionCost VecCost, InstructionCost ScalarCost,
                       StringRef Banner) const {
      dbgs() << "SLP: " << Banner << ":\n";
      E->dump();
      dbgs() << "SLP: Costs:\n";
      dbgs() << "SLP:     ReuseShuffleCost = " << ReuseShuffleCost << "\n";
      dbgs() << "SLP:     VectorCost = " << VecCost << "\n";
      dbgs() << "SLP:     ScalarCost = " << ScalarCost << "\n";
      dbgs() << "SLP:     ReuseShuffleCost + VecCost - ScalarCost = "
             << ReuseShuffleCost + VecCost - ScalarCost << "\n";
    }

    /// Create a new VectorizableTree entry.
    TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                            std::optional<ScheduleData *> Bundle,
                            const InstructionsState &S,
                            const EdgeInfo &UserTreeIdx,
                            ArrayRef<int> ReuseShuffleIndices = {},
                            ArrayRef<unsigned> ReorderIndices = {},
                            unsigned InterleaveFactor = 0) {
      TreeEntry::EntryState EntryState =
          Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
      TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                  ReuseShuffleIndices, ReorderIndices);
      if (E && InterleaveFactor > 0)
        E->setInterleave(InterleaveFactor);
      return E;
    }
    TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                            TreeEntry::EntryState EntryState,
                            std::optional<ScheduleData *> Bundle,
                            const InstructionsState &S,
                            const EdgeInfo &UserTreeIdx,
                            ArrayRef<int> ReuseShuffleIndices = {},
                            ArrayRef<unsigned> ReorderIndices = {}) {
      assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
              (Bundle && EntryState != TreeEntry::NeedToGather)) &&
             "Need to vectorize gather entry?");
      // Gathered loads still gathered? Do not create an entry, use the
      // original one.
      if (GatheredLoadsEntriesFirst.has_value() &&
          EntryState == TreeEntry::NeedToGather && S &&
          S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
          !UserTreeIdx.UserTE)
        return nullptr;
      VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
      TreeEntry *Last = VectorizableTree.back().get();
      Last->Idx = VectorizableTree.size() - 1;
      Last->State = EntryState;
      assert((/*...*/ ReuseShuffleIndices.empty()) &&
             "Reshuffling scalars not yet supported for nodes with padding");
      Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                       ReuseShuffleIndices.end());
      if (ReorderIndices.empty()) {
        Last->Scalars.assign(VL.begin(), VL.end());
        if (S)
          Last->setOperations(S);
      } else {
        // Reorder scalars and build the final mask.
        Last->Scalars.assign(VL.size(), nullptr);
        transform(ReorderIndices, Last->Scalars.begin(),
                  [VL](unsigned Idx) -> Value * {
                    if (Idx >= VL.size())
                      return UndefValue::get(VL.front()->getType());
                    return VL[Idx];
                  });
        if (S)
          Last->setOperations(S);
        Last->ReorderIndices.append(ReorderIndices.begin(),
                                    ReorderIndices.end());
      }
      if (!Last->isGather()) {
        for (Value *V : VL) {
          const TreeEntry *TE = getTreeEntry(V);
          assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
                 "Scalar already in tree!");
          if (TE) {
            if (TE != Last)
              MultiNodeScalars.try_emplace(V).first->getSecond().push_back(
                  Last);
            continue;
          }
          ScalarToTreeEntry[V] = Last;
        }
        // Update the scheduler bundle to point to this TreeEntry.
        ScheduleData *BundleMember = *Bundle;
        assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
                isVectorLikeInstWithConstOps(S.getMainOp()) ||
                doesNotNeedToSchedule(VL)) &&
               "Bundle and VL out of sync");
        if (BundleMember) {
          for (Value *V : VL) {
            if (doesNotNeedToBeScheduled(V))
              continue;
            if (!BundleMember)
              continue;
            BundleMember->TE = Last;
            BundleMember = BundleMember->NextInBundle;
          }
        }
        assert(!BundleMember && "Bundle and VL out of sync");
      } else {
        // Build a map for gathered scalars to the nodes where they are used.
        bool AllConstsOrCasts = true;
        for (Value *V : VL)
          if (!isConstant(V)) {
            auto *I = dyn_cast<CastInst>(V);
            AllConstsOrCasts &= I && I->getType()->isIntegerTy();
            if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
                !UserTreeIdx.UserTE->isGather())
              ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
          }
        if (AllConstsOrCasts)
          CastMaxMinBWSizes =
              std::make_pair(std::numeric_limits<unsigned>::max(), 1);
        MustGather.insert(VL.begin(), VL.end());
      }

      if (UserTreeIdx.UserTE)
        Last->UserTreeIndices.push_back(UserTreeIdx);
      return Last;
    }
    /// -- Vectorization State --
    /// Holds all of the tree entries.
    TreeEntry::VecTreeTy VectorizableTree;

#ifndef NDEBUG
    /// Debug printer.
    LLVM_DUMP_METHOD void dumpVectorizableTree() const {
      for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
        VectorizableTree[Id]->dump();
        dbgs() << "\n";
      }
    }
#endif

    TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }

    const TreeEntry *getTreeEntry(Value *V) const {
      return ScalarToTreeEntry.lookup(V);
    }

    /// Check that the operand node of an alternate node does not generate
    /// a buildvector sequence.
    bool areAltOperandsProfitable(const InstructionsState &S,
                                  ArrayRef<Value *> VL) const;

    /// Checks if the specified list of instructions/values can be vectorized
    /// and fills required data before actual scheduling of the instructions.
    TreeEntry::EntryState
    getScalarsVectorizationState(/*...*/
                                 bool IsScatterVectorizeUserTE,
                                 /*...*/);

    // ...
    using ValueToGatherNodesMap =
        DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
    ValueToGatherNodesMap ValueToGatherNodes;

    /// True if the graph nodes are being transformed (as opposed to being
    /// built).
    bool IsGraphTransformMode = false;

    /// The index of the first gathered-loads entry in the VectorizableTree.
    std::optional<unsigned> GatheredLoadsEntriesFirst;

    /// This POD struct describes one external user in the vectorized tree.
    struct ExternalUser {
      // ...
    };

    /// Cached alias analysis results, keyed by instruction pair.
    std::optional<bool> &getAliasCacheEntry(Instruction *Inst1,
                                            Instruction *Inst2) {
      AliasCacheKey Key = std::make_pair(Inst1, Inst2);
      auto It = AliasCache.find(Key);
      if (It != AliasCache.end())
        return It->second;
      // ...
      // Cache the result for both orderings.
      AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
      // ...
    }

    using AliasCacheKey = std::pair<Instruction *, Instruction *>;

    // ...
    /// A list of values that need to be extracted out of the tree.
    UserList ExternalUses;
  struct ScheduleData {
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

    void init(int BlockSchedulingRegionID) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
    }

    /// Verify basic self-consistency properties.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

      if (IsScheduled) {
        assert(isSchedulingEntity() &&
               "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");
        }
      }

      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      UnscheduledDeps = Dependencies;
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      int Sum = 0;
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
          return InvalidDeps;
        Sum += BundleMember->UnscheduledDeps;
      }
      return Sum;
    }

    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst = nullptr;

    /// The TreeEntry that this instruction corresponds to.
    TreeEntry *TE = nullptr;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle = nullptr;

    /// Single linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle = nullptr;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore = nullptr;

    // ...

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;

    /// The number of dependencies. Consists of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
    /// This value is calculated on demand.
    int Dependencies = InvalidDeps;

    /// The number of dependencies minus the number of dependencies of
    /// scheduled instructions. As soon as this is zero, the
    /// instruction/bundle gets ready for scheduling.
    int UnscheduledDeps = InvalidDeps;

    /// True if this instruction is scheduled (or considered as scheduled in
    /// the dry-run).
    bool IsScheduled = false;
  };

  friend inline raw_ostream &operator<<(raw_ostream &os,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(os);
    return os;
  }
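  // Illustrative note (assumption): a "bundle" is the set of scalar
  // instructions picked for one vector instruction, chained via NextInBundle.
  // A bundle becomes ready when unscheduledDepsInBundle() == 0, i.e. every
  // member has had all of its operand/memory/control dependencies scheduled
  // first, which guarantees the whole bundle can be placed at one point in
  // the block without breaking any dependency.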
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      // Reduce the maximum schedule region size by the size of the previous
      // scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      // ...
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }

    ScheduleData *getScheduleData(Instruction *I) {
      if (BB != I->getParent())
        // Avoid lookup if it can't possibly be in the map.
        return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))
        return SD;
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }
    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;

      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {

        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert to ready list if ready.
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          ScheduleData *OpDef = getScheduleData(I);
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:  gets ready (def): " << *DepBundle << "\n");
          }
        };

        // If BundleMember is a vector bundle, its operands may have been
        // reordered during buildTree(). We therefore need to get its operands
        // through the TreeEntry.
        if (TreeEntry *TE = BundleMember->TE) {
          // Need to search for the lane since the tree entry can be
          // reordered.
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, BundleMember->Inst));
          assert(Lane >= 0 && "Lane not set");

          // Since the vectorization tree is built recursively, this assertion
          // ensures that the tree entry has all operands set before reaching
          // this code. Known exceptions are extracts and intrinsics whose
          // immediate operands are not added.
          auto *In = BundleMember->Inst;
          assert(
              In &&
              (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
               In->getNumOperands() == TE->getNumOperands()) &&
              "Missed TreeEntry operands?");
          (void)In; // fake use to silence unused-variable warnings

          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              DecrUnsched(I);
        } else {
          // If BundleMember is a stand-alone instruction, no operand
          // reordering has taken place, so we directly access its operands.
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
              DecrUnsched(I);
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:  gets ready (mem): " << *DepBundle << "\n");
          }
        }
        // Handle the control dependencies.
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:  gets ready (ctl): " << *DepBundle << "\n");
          }
        }
      }
    }
    /// Verify basic self-consistency properties of the data structure.
    void verify() {
      if (!ScheduleStart)
        return;

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
        SD->verify();
      }

      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");
        (void)SD;
      }
    }

    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
            SD->isReady()) {
          ReadyList.insert(SD);
          LLVM_DEBUG(dbgs()
                     << "SLP:  initially in ready list: " << *SD << "\n");
        }
      }
    }
    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    std::optional<ScheduleData *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S);

    /// Allocates schedule data chunks.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions
    /// or bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();

    // ...

    /// The first memory accessing instruction in the scheduling region.
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory accessing instruction in the scheduling region.
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region? Used to optimize the dependence calculation for the common
    /// case where there isn't.
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    // ...

    /// The ID of the scheduling region. For a new vectorization iteration this
    /// is incremented which "removes" all ScheduleData from the region.
    int SchedulingRegionID = 1;
  };

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);
  /// A helper data structure to hold the orders info.
  struct OrdersTypeDenseMapInfo {
    // ...
    static unsigned getHashValue(const OrdersType &V) {
      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
    }
    // ...
  };

  unsigned MaxVecRegSize;
  unsigned MinVecRegSize;

  /// Final size of the reduced vector, if the current graph represents the
  /// input for the reduction and it was possible to narrow the size of the
  /// reduction.
  unsigned ReductionBitWidth = 0;

  /// Canonical graph size before the transformations.
  unsigned BaseGraphSize = 1;

  /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
  /// type sizes used in the tree.
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
};

template <> struct GraphTraits<BoUpSLP *> {
  // ...
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    // ...
  };

  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }

  static ChildIteratorType child_begin(NodeRef N) {
    return {N->UserTreeIndices.begin(), N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {N->UserTreeIndices.end(), N->Container};
  }

  /// For the node iterator we just need to turn the TreeEntry iterator into a
  /// TreeEntry* iterator so that it dereferences to NodeRef.
  class nodes_iterator {
    // ...
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
  };

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }

  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};
    OS << Entry->Idx << ".\n";
    // ...
    for (auto *V : Entry->Scalars) {
      OS << *V;
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
          }))
        OS << " <extract>";
      OS << "\n";
    }
    return Str;
  }

  static std::string getNodeAttributes(const TreeEntry *Entry,
                                       const BoUpSLP *) {
    if (Entry->isGather())
      return "color=red";
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize)
      return "color=blue";
    return "";
  }
};

BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      // Temporarily insert instructions back to erase them from the parent
      // and memory later.
      if (isa<PHINode>(I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator());
      continue;
    }
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }
4544#ifdef EXPENSIVE_CHECKS
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Reuses[Mask[I]] = Prev[I];
}
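/// Reorders the given order (or the identity order, if it is empty) according
/// to the given mask; the order is cleared if the result is the identity.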
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
    Order.clear();
    return;
  }
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
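/// Tries to recover an existing order for a gather node by matching it against
/// already-vectorized tree entries and extractelement sources; returns
/// std::nullopt if no single-source order can be found.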
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
  Type *ScalarTy = GatheredScalars.front()->getType();
  int NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  auto *VecTy = getWidenedType(ScalarTy, NumScalars);
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= NumScalars ||
      VecTy->getNumElements() % NumParts != 0 ||
      !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
                                VecTy->getNumElements() / NumParts))
    NumParts = 1;
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                            /*ForOrder=*/true);
  // No shuffled operands - ignore.
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
      Entries.front().front()->isSame(TE.Scalars)) {
    // Perfect match in the graph, will reuse the previously vectorized node.
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    int SingleElt = PoisonMaskElem;
    return all_of(Mask, [&](int I) {
      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
        SingleElt = I;
      return I == PoisonMaskElem || I == SingleElt;
    });
  };
  // Exclusive broadcast mask - ignore.
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  SmallBitVector ShuffledSubMasks(NumParts);
  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                  ArrayRef<int> Mask, int PartSz, int NumParts,
                                  function_ref<unsigned(unsigned)> GetVF) {
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
        continue;
      const int VF = GetVF(I);
      if (VF == 0)
        continue;
      unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
      MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
      // Shuffle of at least 2 vectors - ignore.
      if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      // Try to include as many elements from the mask as possible.
      int FirstMin = INT_MAX;
      int SecondVecFound = false;
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem) {
          Value *V = GatheredScalars[I * PartSz + K];
          if (isConstant(V) && !isa<PoisonValue>(V)) {
            SecondVecFound = true;
            break;
          }
          continue;
        }
        if (Idx < VF) {
          if (FirstMin > Idx)
            FirstMin = Idx;
        } else {
          SecondVecFound = true;
          break;
        }
      }
      FirstMin = (FirstMin / PartSz) * PartSz;
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem)
          continue;
        Idx -= FirstMin;
        if (Idx >= PartSz) {
          SecondVecFound = true;
          break;
        }
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      }
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
    }
  };
  int PartSz = getPartNumElems(NumScalars, NumParts);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          unsigned VF = 0;
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            if (ExtractMask[K] == PoisonMaskElem)
              continue;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (K == PoisonMaskElem)
              continue;
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  // Check special corner case - single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
        return 0U;
      return std::max(Entries[I].front()->getVectorFactor(),
                      Entries[I].back()->getVectorFactor());
    });
  int NumUndefs =
      count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
}
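/// Computes the common (minimum) alignment of all values in the list.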
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}
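/// Returns true if the order represents the strictly reversed identity
/// permutation (entries equal to the size are treated as undefined).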
4810 "Order is empty. Please check it before using isReverseOrder.");
4811 unsigned Sz = Order.
size();
4813 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
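/// Checks whether the pointers form a strided access pattern with a runtime
/// (non-constant) stride; on success returns the stride (materialized at
/// \p Inst if one is provided) and fills SortedIndices with the sorted order
/// of the pointers.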
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                  const DataLayout &DL, ScalarEvolution &SE,
                  SmallVectorImpl<unsigned> &SortedIndices,
                  Instruction *Inst = nullptr) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find lower/upper pointers from the PointerOps (with min/max values) and
  // keep their SCEVs.
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return std::nullopt;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
    if (Diff->isNonConstantNegative()) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
    if (Diff1->isNonConstantNegative()) {
      PtrSCEVHighest = PtrSCEV;
      continue;
    }
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest;
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return nullptr;
  };
  // Stride_in_elements = Dist / element_size * (num_elems - 1).
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return std::nullopt;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  // Iterate through all pointers and check if all distances are
  // unique multiples of Stride.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    unsigned Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
      if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
                                                  SE.getMulExpr(Stride, SC)))
               ->isZero())
        return std::nullopt;
      Dist = SC->getAPInt().getZExtValue();
    }
    // If the strides are not the same or repeated, we can't vectorize.
    if (Dist % Size != 0)
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return std::nullopt;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices array only if it is non-consecutive.
    SortedIndices.resize(PointerOps.size(), 0);
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  if (!Inst)
    return nullptr;
  SCEVExpander Expander(SE, DL, "strided-load-vec");
  return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
}
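// Cost helpers: getGEPCosts compares scalar vs. vectorized address
// computation, getShuffleCost recognizes insert-subvector masks, and
// createInsertVector emits an insert_subvector or an equivalent shuffle.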
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy);

static InstructionCost
getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
               VectorType *Tp, ArrayRef<int> Mask = {},
               TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
               int Index = 0, VectorType *SubTp = nullptr) {
  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
  int NumSubElts;
  if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
                             Mask, NumSrcElts, NumSubElts, Index)) {
    if (Index + NumSubElts > NumSrcElts &&
        Index + NumSrcElts <= static_cast<int>(Mask.size()))
      return TTI.getShuffleCost(TTI::SK_InsertSubvector, Tp, Mask, CostKind,
                                Index, SubTp);
  }
  return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

static Value *createInsertVector(
    IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
  const unsigned SubVecVF = getNumElements(V->getType());
  if (Index % SubVecVF == 0) {
    Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
                                     Builder.getInt64(Index));
  } else {
    // insert_subvector requires the index to be a multiple of the subvector
    // length; otherwise, generate a shuffle.
    const unsigned VecVF = getNumElements(Vec->getType());
    SmallVector<int> Mask(VecVF, PoisonMaskElem);
    std::iota(Mask.begin(), Mask.end(), 0);
    for (unsigned I : seq<unsigned>(SubVecVF))
      Mask[I + Index] = I + VecVF;
    if (Generator) {
      Vec = Generator(Vec, V, Mask);
    } else {
      SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
      std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
      V = Builder.CreateShuffleVector(V, ResizeMask);
      Vec = Builder.CreateShuffleVector(Vec, V, Mask);
    }
  }
  return Vec;
}
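/// Classifies how the given bundle of loads can be vectorized: consecutive
/// loads (Vectorize), strided loads (StridedVectorize), masked gather
/// (ScatterVectorize) or plain gather, using a cost comparison between masked
/// gather and loads + shuffles.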
BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                           SmallVectorImpl<unsigned> &Order,
                           SmallVectorImpl<Value *> &PointerOps,
                           unsigned *BestVF, bool TryRecursiveCheck) const {
  if (BestVF)
    *BestVF = 0;
  if (areKnownNonVectorizableLoads(VL))
    return LoadsState::Gather;
  // Check that a vectorized load would load the same memory as a scalar
  // load. For example, we don't want to vectorize loads that are smaller
  // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
  // treats loading/storing it as an i8 struct. If we vectorize loads/stores
  // from such a struct, we read/write packed bits disagreeing with the
  // unvectorized version.
  Type *ScalarTy = VL0->getType();
  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  // Make sure all loads in the bundle are simple - we can't vectorize
  // atomic or volatile loads.
  PointerOps.clear();
  const unsigned Sz = VL.size();
  PointerOps.resize(Sz);
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }

  Order.clear();
  // Check the order of pointer operands or that all pointers are the same.
  bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
  auto *VecTy = getWidenedType(ScalarTy, Sz);
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  if (!IsSorted) {
    if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
      if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
          calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
        return LoadsState::StridedVectorize;
    }
  } else {
    Value *Ptr0;
    Value *PtrN;
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    }
    std::optional<int> Diff =
        getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
    // Check that the sorted loads are consecutive.
    if (static_cast<unsigned>(*Diff) == Sz - 1)
      return LoadsState::Vectorize;
    bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
    // Try to generate strided load node if:
    // 1. The target supports strided loads.
    // 2. The stride is profitable (see the conditions below).
    // 3. Any pointer operand is an instruction with users outside of the
    // current graph (for masked gathers extra extractelement instructions
    // might be required).
    auto IsAnyPointerUsedOutGraph =
        IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
          return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
                   return !getTreeEntry(U) && !MustGather.contains(U);
                 });
        });
    const unsigned AbsoluteDiff = std::abs(*Diff);
    if (IsPossibleStrided &&
        (IsAnyPointerUsedOutGraph ||
         ((Sz > MinProfitableStridedLoads ||
           (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
            has_single_bit(AbsoluteDiff))) &&
          AbsoluteDiff > Sz) ||
         *Diff == -(static_cast<int>(Sz) - 1))) {
      int Stride = *Diff / static_cast<int>(Sz - 1);
      if (*Diff == Stride * static_cast<int>(Sz - 1)) {
        Align Alignment =
            cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
                ->getAlign();
        if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
          // Iterate through all pointers and check if all distances are
          // unique multiples of Dist.
          SmallSet<int, 4> Dists;
          for (Value *Ptr : PointerOps) {
            int Dist = 0;
            if (Ptr == PtrN)
              Dist = *Diff;
            else if (Ptr != Ptr0)
              Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
            // If the strides are not the same or repeated, we can't vectorize.
            if (((Dist / Stride) * Stride) != Dist ||
                !Dists.insert(Dist).second)
              break;
          }
          if (Dists.size() == Sz)
            return LoadsState::StridedVectorize;
        }
      }
    }
  }
  // Correctly compare the cost of loads + shuffles against strided/masked
  // gather loads. Returns true if the vectorized + shuffles representation is
  // better than just gather.
  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
                                                unsigned *BestVF,
                                                bool ProfitableGatherPointers) {
    if (BestVF)
      *BestVF = 0;
    // Compare masked gather cost and loads + insert subvector costs.
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    auto [ScalarGEPCost, VectorGEPCost] =
        getGEPCosts(TTI, PointerOps, PointerOps.front(),
                    Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
    // Estimate the cost of the masked gather GEP. If not a splat, roughly
    // estimate as a buildvector, otherwise estimate as splat.
    APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
    VectorType *PtrVecTy =
        getWidenedType(PointerOps.front()->getType()->getScalarType(),
                       VecTy->getNumElements());
    if (static_cast<unsigned>(count_if(
            PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
        any_of(PointerOps, [&](Value *V) {
          return getUnderlyingObject(V) !=
                 getUnderlyingObject(PointerOps.front());
        }))
      VectorGEPCost += TTI.getScalarizationOverhead(
          PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
    // The cost of scalar loads.
    InstructionCost ScalarLoadsCost =
        std::accumulate(VL.begin(), VL.end(), InstructionCost(),
                        [&](InstructionCost C, Value *V) {
                          return C + TTI.getInstructionCost(
                                         cast<Instruction>(V), CostKind);
                        }) +
        ScalarGEPCost;
    // The cost of masked gather.
    InstructionCost MaskedGatherCost =
        TTI.getGatherScatterOpCost(Instruction::Load, VecTy,
                                   cast<LoadInst>(VL0)->getPointerOperand(),
                                   /*VariableMask=*/false, CommonAlignment,
                                   CostKind) +
        (ProfitableGatherPointers ? 0 : VectorGEPCost);
    InstructionCost GatherCost =
        TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
                                     /*Extract=*/false, CostKind) +
        ScalarLoadsCost;
    // If the list of loads is small or we already performed a partial check,
    // directly compare masked gather cost and gather cost.
    constexpr unsigned ListLimit = 4;
    if (!TryRecursiveCheck || VL.size() < ListLimit)
      return MaskedGatherCost - GatherCost >= -SLPCostThreshold;

    unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
    unsigned MinVF = getMinVF(2 * Sz);
    DemandedElts.clearAllBits();
    // Iterate through possible vectorization factors and check if vectorized +
    // shuffles is better than just gather.
    for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
      SmallVector<LoadsState> States;
      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
        ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
        SmallVector<unsigned> Order;
        SmallVector<Value *> PointerOps;
        LoadsState LS =
            canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
                              /*TryRecursiveCheck=*/false);
        if (LS == LoadsState::Gather) {
          DemandedElts.setBits(Cnt, Cnt + VF);
          continue;
        }
        States.push_back(LS);
      }
      if (DemandedElts.isAllOnes())
        // All loads gathered - try a smaller VF.
        continue;
      // Otherwise these can be vectorized as a series of loads/inserts.
      InstructionCost VecLdCost = 0;
      if (!DemandedElts.isZero()) {
        VecLdCost =
            TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
                                         /*Extract=*/false, CostKind) +
            ScalarGEPCost;
        for (unsigned Idx : seq<unsigned>(VL.size()))
          if (DemandedElts[Idx])
            VecLdCost +=
                TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
      }
      auto *SubVecTy = getWidenedType(ScalarTy, VF);
      for (auto [I, LS] : enumerate(States)) {
        auto *LI0 = cast<LoadInst>(VL[I * VF]);
        InstructionCost VectorGEPCost =
            getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
                        LI0->getPointerOperand(),
                        Instruction::GetElementPtr, CostKind, ScalarTy,
                        SubVecTy)
                .second;
        if (LS == LoadsState::ScatterVectorize &&
            (static_cast<unsigned>(
                 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
                 PointerOps.size() - 1 ||
             any_of(PointerOps, [&](Value *V) {
               return getUnderlyingObject(V) !=
                      getUnderlyingObject(PointerOps.front());
             })))
          VectorGEPCost += TTI.getScalarizationOverhead(
              SubVecTy, APInt::getAllOnes(VF),
              /*Insert=*/true, /*Extract=*/false, CostKind);
        switch (LS) {
        case LoadsState::Vectorize:
          VecLdCost +=
              TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
                                  LI0->getPointerAddressSpace(), CostKind,
                                  TTI::OperandValueInfo()) +
              VectorGEPCost;
          break;
        case LoadsState::StridedVectorize:
          VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
                                                  LI0->getPointerOperand(),
                                                  /*VariableMask=*/false,
                                                  CommonAlignment, CostKind) +
                       VectorGEPCost;
          break;
        case LoadsState::ScatterVectorize:
          VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
                                                  LI0->getPointerOperand(),
                                                  /*VariableMask=*/false,
                                                  CommonAlignment, CostKind) +
                       VectorGEPCost;
          break;
        case LoadsState::Gather:
          // Gathers are already counted - ignore.
          continue;
        }
        SmallVector<int> ShuffleMask(VL.size());
        for (int Idx : seq<int>(0, VL.size()))
          ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
        // Add the cost of the subvector inserts.
        VecLdCost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy,
                                      ShuffleMask, CostKind, I * VF, SubVecTy);
      }
      // If the masked gather cost is higher - better to vectorize, so consider
      // it as a gather node. It will be better estimated later.
      if (MaskedGatherCost >= VecLdCost &&
          VecLdCost - GatherCost < -SLPCostThreshold) {
        if (BestVF)
          *BestVF = VF;
        return true;
      }
    }
    return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
  };
  // TODO: need to improve analysis of the pointers, if not all of them are
  // GEPs or have > 2 operands, we end up with a gather node, which just
  // increases the cost.
  Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
  bool ProfitableGatherPointers =
      L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
                       return L->isLoopInvariant(V);
                     })) <= Sz / 2;
  if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
        return (GEP && GEP->getNumOperands() == 2 &&
                isa<Constant, Instruction>(GEP->getOperand(1)));
      })) {
    // Check if the potential masked gather can be represented as a series
    // of loads + insertsubvectors.
    if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
                                                     ProfitableGatherPointers))
      return LoadsState::ScatterVectorize;
  }

  return LoadsState::Gather;
}
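/// Groups pointers by base object and basic block, sorts each group by
/// distance from its base and emits the resulting permutation in
/// SortedIndices; returns false if no useful clustering exists.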
5345 "Expected list of pointer operands.");
5355 .first->second.emplace_back().emplace_back(VL.
front(), 0U, 0U);
5357 SortedIndices.
clear();
5359 auto Key = std::make_pair(BBs[Cnt + 1],
5363 std::optional<int> Diff = getPointersDiff(
5364 ElemTy, std::get<0>(Base.front()), ElemTy,
5370 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5376 if (Bases.
size() > VL.
size() / 2 - 1)
5380 Bases.
find(Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
5387 if (Bases.
size() == 1 && (Bases.
front().second.size() == 1 ||
5388 Bases.
front().second.size() == VL.
size()))
5393 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
5402 FirstPointers.
insert(P1);
5403 SecondPointers.
insert(P2);
5409 "Unable to find matching root.");
5412 for (
auto &
Base : Bases) {
5413 for (
auto &Vec :
Base.second) {
5414 if (Vec.size() > 1) {
5415 stable_sort(Vec, [](
const std::tuple<Value *, int, unsigned> &
X,
5416 const std::tuple<Value *, int, unsigned> &
Y) {
5417 return std::get<1>(
X) < std::get<1>(
Y);
5419 int InitialOffset = std::get<1>(Vec[0]);
5420 bool AnyConsecutive =
5422 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
5426 if (!AnyConsecutive)
5431 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5435 for (
auto &
T : Bases)
5436 for (
const auto &Vec :
T.second)
5437 for (
const auto &
P : Vec)
5441 "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();

  SmallVector<Value *> Ptrs;
  Ptrs.reserve(TE.Scalars.size());
  SmallVector<BasicBlock *> BBs;
  BBs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    Ptrs.push_back(L->getPointerOperand());
    BBs.push_back(L->getParent());
  }

  BoUpSLP::OrdersType Order;
  if (!LoadEntriesToVectorize.contains(TE.Idx) &&
      clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
    return std::move(Order);
  return std::nullopt;
}
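/// Returns true if the two insertelement instructions belong to the same
/// buildvector chain and do not write the same lane twice.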
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *VU, InsertElementInst *V,
    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Instructions must be from the same basic blocks.
  if (VU->getParent() != V->getParent())
    return false;
  // Checks if 2 insertelements are from the same buildvector.
  if (VU->getType() != V->getType())
    return false;
  // Multiple used inserts are separate nodes.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  std::optional<unsigned> Idx1 = getElementIndex(VU);
  std::optional<unsigned> Idx2 = getElementIndex(V);
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  // Go through the vector operand of insertelement instructions trying to find
  // either VU as the original vector for IE2 or V as the original vector for
  // IE1.
  InsertElementInst *IE1 = VU;
  InsertElementInst *IE2 = V;
  SmallBitVector ReusedIdx(
      cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
  bool IsReusedIdx = false;
  do {
    if (IE2 == VU && !IE1)
      return VU->hasOneUse();
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
      IsReusedIdx |= ReusedIdx.test(Idx1);
      ReusedIdx.set(Idx1);
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        IE1 = nullptr;
      else
        IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    }
    if (IE2 && IE2 != VU) {
      unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
      IsReusedIdx |= ReusedIdx.test(Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        IE2 = nullptr;
      else
        IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
    }
  } while (!IsReusedIdx && (IE1 || IE2));
  return false;
}
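/// Computes the preferred reordering for a tree entry, if any: reuse-shuffle
/// clusters, extractelement indices, PHI user order, consecutive loads and
/// previously found reused orders are all considered.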
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
  // No need to reorder if we only need to shuffle reuses; still need to
  // shuffle the node.
  if (!TE.ReuseShuffleIndices.empty()) {
    assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    if (isSplat(TE.Scalars))
      return std::nullopt;
    // Check if the reuse shuffle indices can be improved by reordering.
    // For this, check that the reuse mask is "clustered", i.e. each scalar
    // value is used once in each submask of size Sz.
    unsigned Sz = TE.Scalars.size();
    if (TE.isGather()) {
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE)) {
        SmallVector<int> Mask;
        fixupOrderingIndices(*CurrentOrder);
        inversePermutation(*CurrentOrder, Mask);
        ::addMask(Mask, TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
            if (Idx != PoisonMaskElem)
              Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
                                             2 * TE.getVectorFactor())) == 1)
      return std::nullopt;
    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                     Sz)) {
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      ::addMask(ReorderMask, TE.ReuseShuffleIndices);
      unsigned VF = ReorderMask.size();
      OrdersType ResOrder(VF, VF);
      unsigned NumParts = divideCeil(VF, Sz);
      SmallBitVector UsedVals(NumParts);
      for (unsigned I = 0; I < VF; I += Sz) {
        int Val = PoisonMaskElem;
        unsigned UndefCnt = 0;
        unsigned Limit = std::min(Sz, VF - I);
        if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
                   [&](int Idx) {
                     if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
                       Val = Idx;
                     if (Idx == PoisonMaskElem)
                       ++UndefCnt;
                     return Idx != PoisonMaskElem && Idx != Val;
                   }) ||
            Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
            UndefCnt > Sz / 2)
          return std::nullopt;
        UsedVals.set(Val);
        for (unsigned K = 0; K < NumParts; ++K) {
          unsigned Idx = Val + Sz * K;
          if (Idx < VF)
            ResOrder[Idx] = I + K;
        }
      }
      return std::move(ResOrder);
    }
    unsigned VF = TE.getVectorFactor();
    // Try to build the correct order for extractelement instructions.
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    if (TE.getOpcode() == Instruction::ExtractElement &&
        all_of(TE.Scalars, [Sz](Value *V) {
          if (isa<PoisonValue>(V))
            return true;
          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
          return Idx && *Idx < Sz;
        })) {
      assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
                                   "by BinaryOperator and CastInst.");
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == PoisonMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
        Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
      }
    }
    // Build an order of the VF size; reuse shuffles are always of VF size.
    OrdersType ResOrder(VF);
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
      if (SubMask.front() == PoisonMaskElem)
        std::iota(SubMask.begin(), SubMask.end(), 0);
      reorderOrder(CurrentOrder, SubMask);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
    }
    if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
          return Data.index() == Data.value();
        }))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      any_of(TE.UserTreeIndices,
             [](const EdgeInfo &EI) {
               return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
             }) &&
      (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
    return std::nullopt;
  if ((TE.State == TreeEntry::Vectorize ||
       TE.State == TreeEntry::StridedVectorize) &&
      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
    assert(!TE.isAltShuffle() && "Alternate instructions are only supported by "
                                 "BinaryOperator and CastInst.");
    return TE.ReorderIndices;
  }
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;

    SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
    for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
      if (!V->hasNUsesOrMore(1))
        continue;
      auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
      if (!II)
        continue;
      Instruction *BVHead = nullptr;
      BasicBlock *BB = II->getParent();
      while (II && II->hasOneUse() && II->getParent() == BB) {
        BVHead = II;
        II = dyn_cast<InsertElementInst>(II->getOperand(0));
      }
      I = BVHead;
    }

    auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
      assert(BB1 != BB2 && "Expected different basic blocks.");
      auto *NodeA = DT->getNode(BB1);
      auto *NodeB = DT->getNode(BB2);
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
    };
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
        return false;
      if (isa<PoisonValue>(V1))
        return true;
      if (isa<PoisonValue>(V2))
        return false;
      if (V1->getNumUses() < V2->getNumUses())
        return true;
      if (V1->getNumUses() > V2->getNumUses())
        return false;
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
        return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
                                    FirstUserOfPhi2->getParent());
      auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
      auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
      auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
      auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
      if (IE1 && !IE2)
        return true;
      if (!IE1 && IE2)
        return false;
      if (IE1 && IE2) {
        if (UserBVHead[I1] && !UserBVHead[I2])
          return true;
        if (!UserBVHead[I1])
          return false;
        if (UserBVHead[I1] == UserBVHead[I2])
          return getElementIndex(IE1) < getElementIndex(IE2);
        if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
          return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
                                      UserBVHead[I2]->getParent());
        return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
      }
      if (EE1 && !EE2)
        return true;
      if (!EE1 && EE2)
        return false;
      if (EE1 && EE2) {
        auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
        auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
        auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
        auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
        if (!Inst2 && !P2)
          return Inst1 || P1;
        if (EE1->getOperand(0) == EE2->getOperand(0))
          return getElementIndex(EE1) < getElementIndex(EE2);
        if (!Inst1 && Inst2)
          return false;
        if (Inst1 && Inst2) {
          if (Inst1->getParent() != Inst2->getParent())
            return CompareByBasicBlocks(Inst1->getParent(),
                                        Inst2->getParent());
          return Inst1->comesBefore(Inst2);
        }
        if (!P1 && P2)
          return false;
        assert(P1 && P2 &&
               "Expected either instructions or arguments vector operands.");
        return P1->getArgNo() < P2->getArgNo();
      }
      return false;
    };
    OrdersType Phis(TE.Scalars.size());
    std::iota(Phis.begin(), Phis.end(), 0);
    stable_sort(Phis, PHICompare);
    if (isIdentityOrder(Phis))
      return std::nullopt; // No need to reorder.
    return std::move(Phis);
  }
  if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
    if ((TE.getOpcode() == Instruction::ExtractElement ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
        all_of(TE.Scalars, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        })) {
      // Check that a gather of extractelements can be represented as just a
      // shuffle of a single vector.
      OrdersType CurrentOrder;
      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
                                   /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // If the gather node is <undef, v, .., poison> and
    // insertelement poison, v, 0 [+ permute]
    // is cheaper than
    // insertelement poison, v, n - try to reorder.
    int Sz = TE.Scalars.size();
    if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
        count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
      const auto *It =
          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
      if (It == TE.Scalars.begin())
        return OrdersType();
      auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
      if (It != TE.Scalars.end()) {
        OrdersType Order(Sz, Sz);
        unsigned Idx = std::distance(TE.Scalars.begin(), It);
        Order[Idx] = 0;
        fixupOrderingIndices(Order);
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        InstructionCost PermuteCost =
            TopToBottom
                ? 0
                : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
        InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
            PoisonValue::get(Ty), *It);
        InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
            Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
            PoisonValue::get(Ty), *It);
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          OrdersType Order(Sz, Sz);
          Order[Idx] = 0;
          return std::move(Order);
        }
      }
    }
    if (isSplat(TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    // Check if we can include the order of vectorized loads. For masked
    // gathers do extra analysis later, so include such nodes into a special
    // list.
    if (TE.isGather() && TE.getOpcode() == Instruction::Load) {
      SmallVector<Value *> PointerOps;
      OrdersType CurrentOrder;
      LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                         CurrentOrder, PointerOps);
      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
        return std::move(CurrentOrder);
    }
    if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
      return CurrentOrder;
  }
  return std::nullopt;
}
/// Checks if the given mask is a "clustered" mask with the same clusters of
/// size \p Sz repeated \p Mask.size() / \p Sz times.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                               unsigned Sz) {
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
    return false;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef<int> Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
  return true;
}
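/// Reorders a gather node with reused scalars in place: only the reuse mask is
/// rewritten unless the mask forms repeated non-identity clusters.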
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder the reuses mask.
  reorderReuses(TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized and non-clustered reuses no need to do anything else.
  if (!TE.isGather() ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                   Sz) ||
      !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
    return;
  SmallVector<int> NewMask;
  inversePermutation(TE.ReorderIndices, NewMask);
  addMask(NewMask, TE.ReuseShuffleIndices);
  // Clear reorder since it is going to be applied to the new mask.
  TE.ReorderIndices.clear();
  // Try to improve gathered nodes with clustered reuses, if possible.
  ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
  SmallVector<unsigned> NewOrder(Slice);
  inversePermutation(NewOrder, NewMask);
  reorderScalars(TE.Scalars, NewMask);
  // Fill the reuses mask with the identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}
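/// Merges a secondary order into the primary one, filling only positions that
/// are still undefined and whose targets are not yet used.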
5888 "Expected same size of orders");
5889 unsigned Sz = Order.
size();
5891 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
5892 if (Order[
Idx] != Sz)
5893 UsedIndices.
set(Order[
Idx]);
5895 if (SecondaryOrder.
empty()) {
5896 for (
unsigned Idx : seq<unsigned>(0, Sz))
5897 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
5900 for (
unsigned Idx : seq<unsigned>(0, Sz))
5901 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
5902 !UsedIndices.
test(SecondaryOrder[
Idx]))
5903 Order[
Idx] = SecondaryOrder[
Idx];
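/// Top-to-bottom reordering: for each vector factor, counts how often each
/// order is requested by the nodes and their external store users, picks the
/// most frequent one and applies it to all nodes of that width.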
void BoUpSLP::reorderTopToBottom() {
  // Maps VF to the graph nodes.
  DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
  // ExtractElement gather nodes which can be vectorized and need to handle
  // their ordering.
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
  // Phi nodes can have a preferred ordering based on their result users.
  DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
  // AltShuffles can also have a preferred ordering that leads to fewer
  // instructions, e.g., the addsub instruction on x86.
  DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
  // Maps a TreeEntry to the reorder indices of its external users.
  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
      ExternalUserReorderMap;
  // Find all reorderable nodes with the given VF.
  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      ExternalUserReorderMap.try_emplace(
          TE.get(), std::move(ExternalUserReorderIndices));
    }

    // Patterns like [fadd,fsub] can be combined into a single instruction on
    // x86. Reordering them into [fsub,fadd] blocks this pattern, so take their
    // order into account when looking for the most used order.
    if (TE->isAltShuffle()) {
      VectorType *VecTy =
          getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
      // If this pattern is supported by the target then consider its order.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
        AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
      }
    }
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/true)) {
      // Do not include the ordering of nodes used in alt-opcode vectorization:
      // it is better to reorder them during the bottom-to-top stage, since such
      // nodes already end up with a shuffle instruction that can absorb the
      // reordering.
      unsigned Cnt = 0;
      const TreeEntry *UserTE = TE.get();
      while (UserTE && Cnt < RecursionMaxDepth) {
        if (UserTE->UserTreeIndices.size() != 1)
          break;
        if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
              return EI.UserTE->State == TreeEntry::Vectorize &&
                     EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
            }))
          return;
        UserTE = UserTE->UserTreeIndices.back().UserTE;
        ++Cnt;
      }
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });

  // Reorder the graph nodes according to their vectorization factor.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order: the most used one.
    ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
    MapVector<OrdersType, unsigned,
              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
        OrdersUses;
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder these nodes; only need to merge the reordering
      // shuffle and the reuse shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      // Count the number of order uses.
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // If the vector factor != number of scalars, use the natural order:
        // this is a node with reused scalars but external uses.
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
        }
        // No other useful reorder data in this entry.
        if (Order.empty())
          continue;
      }
      // Stores actually store the mask, not the order; need to invert.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(CurrentOrder);
        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
      } else {
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
        combineOrders(IdentityOrder, Pair.first);
      }
    }
    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // Prefer the identity order. But, if a filled identity (non-empty order)
      // with the same number of uses as the new candidate order was found, we
      // can choose the candidate order.
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           isIdentityOrder(BestOrder))) {
        combineOrders(Pair.first, BestOrder);
        BestOrder = Pair.first;
        Cnt = Pair.second;
      } else {
        combineOrders(BestOrder, Pair.first);
      }
    }
    // Set the order of the user node.
    if (isIdentityOrder(BestOrder))
      continue;
    fixupOrderingIndices(BestOrder);
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // Do the actual reordering, if profitable.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // Just do the reordering for the nodes with the given VF.
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          // Reorder the reuses masks of operands with a smaller VF so the
          // match between graph nodes and scalar operands of the node can be
          // found during vectorization/cost estimation.
          assert(all_of(TE->UserTreeIndices,
                        [VF, &TE](const EdgeInfo &EI) {
                          return EI.UserTE->Scalars.size() == VF ||
                                 EI.UserTE->Scalars.size() ==
                                     TE->Scalars.size();
                        }) &&
                 "All users must be of VF size.");
          if (SLPReVec) {
            // ShuffleVectorInst supports only a limited set of patterns, so
            // only do reorderNodeWithReuses if none of the users is a
            // ShuffleVectorInst.
            if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                  return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
                }))
              continue;
            assert(none_of(TE->UserTreeIndices,
                           [&](const EdgeInfo &EI) {
                             return isa<ShuffleVectorInst>(
                                 EI.UserTE->getMainOp());
                           }) &&
                   "Does not know how to reorder.");
          }
          // Update the ordering of operands with a smaller VF.
          reorderNodeWithReuses(*TE, Mask);
        }
        continue;
      }
      if ((TE->State == TreeEntry::Vectorize ||
           TE->State == TreeEntry::StridedVectorize) &&
          (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
               InsertElementInst>(TE->getMainOp()) ||
           (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
        assert(!TE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // Build correct orders for extract{element,value}, loads and stores.
        reorderOrder(TE->ReorderIndices, Mask);
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        reorderScalars(TE->Scalars, Mask);
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply the reversed order to keep the original ordering of the reused
        // elements and avoid extra reorder shuffling.
        OrdersType CurrentOrder;
        reorderOrder(CurrentOrder, MaskOrder);
        SmallVector<int> NewReuses;
        inversePermutation(CurrentOrder, NewReuses);
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      }
    }
  }
}
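/// Checks whether the operands of the user node can be reordered: each
/// vectorized operand must be used only by this node, and gather operands
/// without their own order are collected for in-place reordering.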
bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    ArrayRef<TreeEntry *> ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize);
        }))
      continue;
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // Do not reorder if the operand node is used by many user nodes.
      if (any_of(TE->UserTreeIndices,
                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
        return false;
      // Add the node to the list of the ordered nodes with the identity order.
      Edges.emplace_back(I, TE);
      // Add ScatterVectorize nodes to the list of operands, where just
      // reordering of the scalars is required. Similar to gathers, so simply
      // add to the list of gathered ops. If there are reused scalars, process
      // this node as a regular vectorized node, just reorder the reuses mask.
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    TreeEntry *Gather = nullptr;
    if (count_if(ReorderableGathers,
                 [&Gather, UserTE, I](TreeEntry *TE) {
                   assert(TE->State != TreeEntry::Vectorize &&
                          TE->State != TreeEntry::StridedVectorize &&
                          "Only non-vectorized nodes are expected.");
                   if (any_of(TE->UserTreeIndices,
                              [UserTE, I](const EdgeInfo &EI) {
                                return EI.UserTE == UserTE && EI.EdgeIdx == I;
                              })) {
                     assert(TE->isSame(UserTE->getOperand(I)) &&
                            "Operand entry does not match operands.");
                     Gather = TE;
                     return true;
                   }
                   return false;
                 }) > 1 &&
        !allConstant(UserTE->getOperand(I)))
      return false;
    if (Gather)
      GatherOps.push_back(Gather);
  }
  return true;
}
void BoUpSLP::reorderBottomToUp(bool IgnoreReorder) {
  SetVector<TreeEntry *> OrderedEntries;
  DenseSet<const TreeEntry *> GathersToOrders;
  // Find all reorderable leaf nodes with the given VF. Currently these are
  // vectorized loads and extracts without alternate operands + some gathering
  // of extracts.
  SmallVector<TreeEntry *> NonVectorized;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize)
      NonVectorized.push_back(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false)) {
      OrderedEntries.insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
    }
  }

  // 1. Propagate order to the graph nodes, which use only reordered nodes.
  // I.e., if the node has operands that are reordered, try to make at least
  // one operand order natural and reorder the others + reorder the user node
  // itself.
  SmallPtrSet<const TreeEntry *, 4> Visited;
  while (!OrderedEntries.empty()) {
    // 1. Filter out only reordered nodes.
    // 2. If the entry has multiple uses - skip it and jump to the next node.
    DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
    SmallVector<TreeEntry *> Filtered;
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          !all_of(drop_begin(TE->UserTreeIndices),
                  [TE](const EdgeInfo &EI) {
                    return EI.UserTE == TE->UserTreeIndices.front().UserTE;
                  }) ||
          !Visited.insert(TE).second) {
        Filtered.push_back(TE);
        continue;
      }
      // Build a map between user nodes and their operands order to speed up
      // the search. The graph does not provide this dependency directly.
      for (EdgeInfo &EI : TE->UserTreeIndices)
        Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
    }
    // Erase filtered entries, so they can be recognized as gathers later.
    for (TreeEntry *TE : Filtered)
      OrderedEntries.remove(TE);
    SmallVector<
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
        UsersVec(Users.begin(), Users.end());
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    });
    for (auto &Data : UsersVec) {
      // Check that the operands are used only in the User node.
      SmallVector<TreeEntry *> GatherOps;
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              GatherOps)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // All operands are reordered and used only in this node - propagate the
      // most used order to the user node.
      MapVector<OrdersType, unsigned,
                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
          OrdersUses;
      // Analyze each tree entry only once; otherwise the order of the same
      // node may be considered several times, though it might not be
      // profitable.
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
          continue;
        const auto Order = [&]() -> const OrdersType {
          if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
            return getReorderingData(*OpTE, /*TopToBottom=*/false)
                .value_or(OrdersType(1));
          return OpTE->ReorderIndices;
        }();
        // The order is partially ordered, skip it in favor of fully
        // non-ordered orders.
        if (Order.size() == 1)
          continue;
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask, not the order; need to invert.
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          assert(!OpTE->isAltShuffle() &&
                 "Alternate instructions are only supported by BinaryOperator "
                 "and CastInst.");
          SmallVector<int> Mask;
          inversePermutation(Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(CurrentOrder);
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
              NumOps;
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        }
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto AllowsReordering = [&](const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->isGather()) {
            if (GathersToOrders.contains(TE))
              return !getReorderingData(*TE, /*TopToBottom=*/false)
                          .value_or(OrdersType(1))
                          .empty();
            return true;
          }
          return false;
        };
        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
          TreeEntry *UserTE = EI.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            continue;
          // May reorder the user node if it requires reordering, has reused
          // scalars, is an alternate-op vectorize node or its op nodes require
          // reordering.
          if (AllowsReordering(UserTE))
            continue;
          // Check if the users allow reordering: look up just 1 level of
          // operands to limit compile time. Profitable to reorder only if
          // definitely more operands allow reordering than those with the
          // natural order.
          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           all_of(Op.second->UserTreeIndices,
                                  [UserTE](const EdgeInfo &EI) {
                                    return EI.UserTE == UserTE;
                                  });
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
      if (OrdersUses.empty()) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // Choose the most used order.
      unsigned IdentityCnt = 0;
      unsigned VF = Data.second.front().second->getVectorFactor();
      OrdersType IdentityOrder(VF, VF);
      for (auto &Pair : OrdersUses) {
        if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
          IdentityCnt += Pair.second;
          combineOrders(IdentityOrder, Pair.first);
        }
      }
      MutableArrayRef<unsigned> BestOrder = IdentityOrder;
      unsigned Cnt = IdentityCnt;
      for (auto &Pair : OrdersUses) {
        // Prefer the identity order unless another candidate is strictly more
        // frequent.
        if (Cnt < Pair.second) {
          combineOrders(Pair.first, BestOrder);
          BestOrder = Pair.first;
          Cnt = Pair.second;
        } else {
          combineOrders(BestOrder, Pair.first);
        }
      }
      // Set the order of the user node.
      if (isIdentityOrder(BestOrder)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      fixupOrderingIndices(BestOrder);
      // Erase operands from OrderedEntries list and adjust their orders.
      VisitedOps.clear();
      SmallVector<int> Mask;
      inversePermutation(BestOrder, Mask);
      SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
      unsigned E = BestOrder.size();
      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        OrderedEntries.remove(TE);
        if (!VisitedOps.insert(TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::StridedVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(TE->ReorderIndices, Mask);
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers just need to reorder the scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // Just reorder the reuses indices.
          reorderReuses(Gather->ReuseShuffleIndices, Mask);
          continue;
        }
        reorderScalars(Gather->Scalars, Mask);
        OrderedEntries.remove(Gather);
      }
      // Reorder operands of the user node and set the ordering for the user
      // node itself.
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Data.first->getMainOp()) ||
          Data.first->isAltShuffle())
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle() ||
          Data.first->State == TreeEntry::StridedVectorize) {
        reorderScalars(Data.first->Scalars, Mask);
        reorderOrder(Data.first->ReorderIndices, MaskOrder,
                     /*BottomOrder=*/true);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !Data.first->isAltShuffle()) {
          // Insert user node into the list to try to sink reordering deeper in
          // the graph.
          OrderedEntries.insert(Data.first);
        }
      } else {
        reorderOrder(Data.first->ReorderIndices, Mask);
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if ((Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
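/// Records every scalar of the tree that is used outside of it (or listed as
/// externally used), so that an extractelement can be emitted for it later.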
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  DenseMap<Value *, unsigned> ScalarToExtUses;
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather())
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
        continue;
      // All uses must be replaced already? No need to do it again.
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;

      // Check if the scalar is externally used as an extra arg.
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
        continue;
      }
      for (User *U : Scalar->users()) {
        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst || isDeleted(UserInst))
          continue;

        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;

        // Skip in-tree scalars that become vectors.
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          // Some in-tree scalars will remain as scalars in vectorized
          // instructions. If that is the case, the one in FoundLane will
          // be used.
          if (UseEntry->State == TreeEntry::ScatterVectorize ||
              !doesInTreeUserNeedToExtract(
                  Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(!UseEntry->isGather() && "Bad state");
            continue;
          }
          U = nullptr;
          if (It != ScalarToExtUses.end()) {
            ExternalUses[It->second].User = nullptr;
            break;
          }
        }

        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          U = nullptr;
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, FoundLane);
        if (!U)
          break;
      }
    }
  }
}
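/// Collects, per underlying pointer object, the stores fed by the scalars of
/// the given tree entry (at most one store per lane).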
SmallVector<SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
                SmallVector<StoreInst *>, 8>
      PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (!isa<Instruction>(V))
      continue;
    // To save compilation time, don't visit values with too many users.
    if (V->hasNUsesOrMore(UsesLimit))
      break;

    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      // Test whether we can handle the store. V might be a global, which could
      // be used in a different function.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // Skip entries already in the tree.
      if (getTreeEntry(U))
        continue;

      Value *Ptr =
          getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(),
                                        Ptr}];
      // For now just keep one store per pointer object per lane.
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        std::optional<int> Diff = getPointersDiff(
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
            /*StrictCheck=*/true);
        // We failed to compare the pointers, so just abandon this store.
        if (!Diff)
          continue;
      }
      StoresVec.push_back(SI);
    }
  }
  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
  unsigned I = 0;
  for (auto &P : PtrToStoresMap) {
    Res[I].swap(P.second);
    ++I;
  }
  return Res;
}
bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // We check whether the stores in StoresVec can form a vector by sorting them
  // and checking whether they are consecutive.

  // To avoid calling getPointersDiff() while sorting, create a vector of pairs
  // {store, offset from first} and sort this instead.
  SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
  StoreInst *S0 = StoresVec[0];
  StoreOffsetVec.emplace_back(0, 0);
  Type *S0Ty = S0->getValueOperand()->getType();
  Value *S0Ptr = S0->getPointerOperand();
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int> Diff =
        getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    StoreOffsetVec.emplace_back(*Diff, Idx);
  }

  // Check if the stores are consecutive by checking if their difference is 1.
  if (StoreOffsetVec.size() != StoresVec.size())
    return false;
  sort(StoreOffsetVec,
       [](const std::pair<int, unsigned> &L,
          const std::pair<int, unsigned> &R) { return L.first < R.first; });
  unsigned Idx = 0;
  int PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
      return false;
    PrevDist = P.first;
    ++Idx;
  }

  // Calculate the shuffle indices according to their offset against the sorted
  // StoreOffsetVec.
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
  for (auto [I, P] : enumerate(StoreOffsetVec)) {
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  }
  // The identity order (e.g., {0,1,2,3}) is represented as an empty OrdersType
  // in reorderTopToBottom() and reorderBottomToUp(); follow the same
  // convention here.
  if (IsIdentity)
    ReorderIndices.clear();

  return true;
}
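// findExternalStoreUsersReorderIndices: for every group of user stores that
// can form a vector, record the store order as a reorder candidate.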
#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif

SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();

  SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);

  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  SmallVector<OrdersType, 1> ExternalReorderIndices;

  // Inspect the stores collected per pointer and look for vectorization
  // candidates. For each candidate calculate the reorder index vector and push
  // it into ExternalReorderIndices.
  for (ArrayRef<StoreInst *> StoresVec : Stores) {
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;
    // If the stores are not consecutive then abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;
    // We now know that the scalars in StoresVec can form a vector instruction,
    // so set the reorder indices.
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}
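// Public entry points: reset the tree and recursively build it from the roots.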
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}
static void gatherPossiblyVectorizableLoads(
    const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
    ScalarEvolution &SE, const TargetTransformInfo &TTI,
    SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
    bool AddNew = true) {
  if (VL.empty())
    return;
  Type *ScalarTy = getValueType(VL.front());
  if (!isValidElementType(ScalarTy))
    return;
  SmallVector<SmallVector<std::pair<LoadInst *, int>>> ClusteredLoads;
  SmallVector<DenseMap<int, LoadInst *>> ClusteredDistToLoad;
  for (Value *V : VL) {
    auto *LI = dyn_cast<LoadInst>(V);
    if (!LI)
      continue;
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
      continue;
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
      std::optional<int> Dist = getPointersDiff(
          LI->getType(), LI->getPointerOperand(),
          Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
          /*StrictCheck=*/true);
      if (!Dist)
        continue;
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
        continue;
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
      }
      IsFound = true;
      break;
    }
    if (!IsFound) {
      ClusteredLoads.emplace_back().emplace_back(LI, 0);
      ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
    }
  }
  auto FindMatchingLoads =
      [&](ArrayRef<std::pair<LoadInst *, int>> Loads,
          SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>>
              &GatheredLoads,
          SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
          int &Offset, unsigned &Start) {
        if (Loads.empty())
          return GatheredLoads.end();
        LoadInst *LI = Loads.front().first;
        for (auto [Idx, Data] : enumerate(GatheredLoads)) {
          if (Idx < Start)
            continue;
          ToAdd.clear();
          if (LI->getParent() != Data.front().first->getParent() ||
              LI->getType() != Data.front().first->getType())
            continue;
          std::optional<int> Dist =
              getPointersDiff(LI->getType(), LI->getPointerOperand(),
                              Data.front().first->getType(),
                              Data.front().first->getPointerOperand(), DL, SE,
                              /*StrictCheck=*/true);
          if (!Dist)
            continue;
          SmallSet<int, 4> DataDists;
          SmallPtrSet<LoadInst *, 4> DataLoads;
          for (std::pair<LoadInst *, int> P : Data) {
            DataDists.insert(P.second);
            DataLoads.insert(P.first);
          }
          // Found matching gathered loads - check if all loads are unique or
          // can be effectively vectorized.
          unsigned NumUniques = 0;
          for (auto [Cnt, Pair] : enumerate(Loads)) {
            bool Used = DataLoads.contains(Pair.first);
            if (!Used && !DataDists.contains(*Dist + Pair.second)) {
              ++NumUniques;
              ToAdd.insert(Cnt);
            } else if (Used) {
              Repeated.insert(Cnt);
            }
          }
          if (NumUniques > 0 &&
              (Loads.size() == NumUniques ||
               (Loads.size() - NumUniques >= 2 &&
                Loads.size() - NumUniques >= Loads.size() / 2 &&
                (has_single_bit(Data.size() + NumUniques) ||
                 bit_ceil(Data.size()) <
                     bit_ceil(Data.size() + NumUniques))))) {
            Offset = *Dist;
            Start = Idx + 1;
            return std::next(GatheredLoads.begin(), Idx);
          }
        }
        ToAdd.clear();
        return GatheredLoads.end();
      };
  for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
    unsigned Start = 0;
    SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
    int Offset = 0;
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                                 Offset, Start);
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
        It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
      ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
                             Start);
    }
    if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
          return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        })) {
      auto AddNewLoads =
          [&](SmallVectorImpl<std::pair<LoadInst *, int>> &Loads) {
            for (unsigned Idx : seq<unsigned>(Data.size())) {
              if (ToAdd.contains(Idx) || Repeated.contains(Idx))
                continue;
              Loads.push_back(Data[Idx]);
            }
          };
      if (!AddNew) {
        LoadInst *LI = Data.front().first;
        It = find_if(GatheredLoads,
                     [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
                       return PD.front().first->getParent() ==
                                  LI->getParent() &&
                              PD.front().first->getType() == LI->getType();
                     });
        while (It != GatheredLoads.end()) {
          AddNewLoads(*It);
          It = std::find_if(
              std::next(It), GatheredLoads.end(),
              [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
                return PD.front().first->getParent() == LI->getParent() &&
                       PD.front().first->getType() == LI->getType();
              });
        }
        continue;
      }
      GatheredLoads.emplace_back().append(Data.begin(), Data.end());
      AddNewLoads(GatheredLoads.emplace_back());
    }
  }
}
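/// Final attempt to vectorize the clusters of gathered loads: tries
/// consecutive, strided and masked-gather forms at decreasing vector factors
/// and builds new tree entries for the successful slices.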
void BoUpSLP::tryToVectorizeGatheredLoads(
    const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                         SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
                         8> &GatheredLoads) {
  GatheredLoadsEntriesFirst = VectorizableTree.size();

  SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
      LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert(VectorizableTree[Idx]->Scalars.begin(),
               VectorizableTree[Idx]->Scalars.end());

  // Sort loads by distance.
  auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
                       const std::pair<LoadInst *, int> &L2) {
    return L1.second > L2.second;
  };

  auto IsMaskedGatherSupported = [&](ArrayRef<LoadInst *> Loads) {
    ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
                             Loads.size());
    Align Alignment = computeCommonAlignment<LoadInst>(Values);
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    return TTI->isLegalMaskedGather(Ty, Alignment) &&
           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
  };

  auto GetVectorizedRanges =
      [this](ArrayRef<LoadInst *> Loads,
             SmallPtrSetImpl<LoadInst *> &VectorizedLoads,
             SmallVectorImpl<LoadInst *> &NonVectorized,
             bool Final, unsigned MaxVF) {
    SmallVector<std::pair<ArrayRef<Value *>, LoadsState>, 4> Results;
    unsigned StartIdx = 0;
    SmallVector<int> CandidateVFs;
    for (int NumElts = getFloorFullVectorNumberOfElements(
             *TTI, Loads.front()->getType(), MaxVF);
         NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
                          *TTI, Loads.front()->getType(), NumElts - 1))
      CandidateVFs.push_back(NumElts);

    if (Final && CandidateVFs.empty())
      return Results;

    unsigned BestVF = Final ? CandidateVFs.back() : 0;
    for (unsigned NumElts : CandidateVFs) {
      if (Final && NumElts > BestVF)
        continue;
      SmallVector<unsigned> MaskedGatherVectorized;
      for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; ++Cnt) {
        ArrayRef<LoadInst *> Slice =
            ArrayRef(Loads).slice(Cnt, std::min<unsigned>(NumElts, E - Cnt));
        if (VectorizedLoads.count(Slice.front()) ||
            VectorizedLoads.count(Slice.back()) ||
            areKnownNonVectorizableLoads(Slice))
          continue;
        // Check if it is profitable to try vectorizing the gathered loads. It
        // is profitable if we have more than 3 consecutive loads or if we have
        // fewer but all users are vectorized or deleted.
        bool AllowToVectorize = false;
        // Check if it is profitable to vectorize 2-element loads.
        if (NumElts == 2) {
          bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
              Slice.front()->getType(), ElementCount::getFixed(NumElts));
          auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
            for (LoadInst *LI : Slice) {
              // If single use/user - allow to vectorize.
              if (LI->hasOneUse())
                continue;
              // 1. Check if the number of uses equals the number of users.
              // 2. All users are deleted.
              // 3. The load broadcasts are not allowed or the load is not
              // broadcasted.
              if (static_cast<unsigned int>(std::distance(
                      LI->user_begin(), LI->user_end())) != LI->getNumUses())
                return false;
              if (!IsLegalBroadcastLoad)
                continue;
              if (LI->hasNUsesOrMore(UsesLimit))
                return false;
              for (User *U : LI->users()) {
                if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
                  continue;
                if (const TreeEntry *UTE = getTreeEntry(U)) {
                  for (int I : seq<int>(UTE->getNumOperands())) {
                    if (all_of(UTE->getOperand(I),
                               [LI](Value *V) { return V == LI; }))
                      // Found a legal broadcast - do not vectorize.
                      return false;
                  }
                }
              }
            }
            return true;
          };
          AllowToVectorize = CheckIfAllowed(Slice);
        } else {
          AllowToVectorize =
              (NumElts >= 3 ||
               any_of(ValueToGatherNodes.at(Slice.front()),
                      [=](const TreeEntry *TE) {
                        return TE->Scalars.size() == 2 &&
                               ((TE->Scalars.front() == Slice.front() &&
                                 TE->Scalars.back() == Slice.back()) ||
                                (TE->Scalars.front() == Slice.back() &&
                                 TE->Scalars.back() == Slice.front()));
                      })) &&
              hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
                                       Slice.size());
        }
        if (AllowToVectorize) {
          SmallVector<Value *> PointerOps;
          OrdersType CurrentOrder;
          // Try to build a vector load.
          ArrayRef<Value *> Values(
              reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
          LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
                                            PointerOps, &BestVF);
          if (LS != LoadsState::Gather ||
              (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
            if (LS == LoadsState::ScatterVectorize) {
              if (MaskedGatherVectorized.empty() ||
                  Cnt >= MaskedGatherVectorized.back() + NumElts)
                MaskedGatherVectorized.push_back(Cnt);
              continue;
            }
            if (LS != LoadsState::Gather) {
              Results.emplace_back(Values, LS);
              VectorizedLoads.insert(Slice.begin(), Slice.end());
              // If we vectorized the initial block, no need to try it again.
              if (Cnt == StartIdx)
                StartIdx += NumElts;
            }
            // Check if the whole array was vectorized already - exit.
            if (StartIdx >= Loads.size())
              break;
            // Erase the last masked gather candidate, if another candidate
            // within the range was found to be better.
            if (!MaskedGatherVectorized.empty() &&
                Cnt < MaskedGatherVectorized.back() + NumElts)
              MaskedGatherVectorized.pop_back();
            Cnt += NumElts - 1;
            continue;
          }
        }
        if (!AllowToVectorize || BestVF == 0)
          registerNonVectorizableLoads(Slice);
      }
      // Mark masked gather candidates as vectorized, if any.
      for (unsigned Cnt : MaskedGatherVectorized) {
        ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
            Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
        ArrayRef<Value *> Values(
            reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
        Results.emplace_back(Values, LoadsState::ScatterVectorize);
        VectorizedLoads.insert(Slice.begin(), Slice.end());
        // If we vectorized the initial block, no need to try it again.
        if (Cnt == StartIdx)
          StartIdx += NumElts;
      }
    }
    for (LoadInst *LI : Loads) {
      if (!VectorizedLoads.contains(LI))
        NonVectorized.push_back(LI);
    }
    return Results;
  };
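  // ProcessGatheredLoads: split each distance-sorted cluster into maximal
  // consecutive runs and try to vectorize every run.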
  auto ProcessGatheredLoads =
      [&, &TTI = *TTI](
          ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
          bool Final = false) {
        SmallVector<LoadInst *> NonVectorized;
        for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
            continue;
          }
          SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
          SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
          transform(LoadsDists, OriginalLoads.begin(),
                    [](const std::pair<LoadInst *, int> &L) -> LoadInst * {
                      return L.first;
                    });
          stable_sort(LocalLoadsDists, LoadSorter);
          SmallVector<LoadInst *> Loads;
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
            if (getTreeEntry(L.first))
              continue;
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<unsigned>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
              Loads.push_back(L.first);
              continue;
            }
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
                !Loads.empty())
              Loads.pop_back();
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
            Loads.push_back(L.first);
          }
          if (Loads.size() <= 1)
            continue;
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
            continue;
          BoUpSLP::ValueSet VectorizedLoads;
          SmallVector<LoadInst *> SortedNonVectorized;
          SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                                  Final, MaxConsecutiveDistance);
          if (!Results.empty() && !SortedNonVectorized.empty() &&
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size() &&
              all_of(Results,
                     [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
                       return P.second == LoadsState::ScatterVectorize;
                     })) {
            VectorizedLoads.clear();
            SmallVector<LoadInst *> UnsortedNonVectorized;
            SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
                UnsortedResults =
                    GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                        UnsortedNonVectorized, Final,
                                        OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
            }
          }
          for (auto [Slice, _] : Results) {
            LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
                              << Slice.size() << ")\n");
            if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) {
              for (Value *L : Slice)
                if (!getTreeEntry(L))
                  SortedNonVectorized.push_back(cast<LoadInst>(L));
              continue;
            }

            // Select the maximum VF as the maximum of the user gathered nodes
            // and the distance between scalar loads in these nodes.
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
            if (MaxVF == 2) {
              UserMaxVF = MaxVF;
            } else {
              // Found distance between segments of the interleaved loads.
              std::optional<unsigned> InterleavedLoadsDistance = 0;
              unsigned Order = 0;
              std::optional<unsigned> CommonVF = 0;
              DenseMap<const TreeEntry *, unsigned> EntryToPosition;
              SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
              for (auto [Idx, V] : enumerate(Slice)) {
                for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                  UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
                  unsigned Pos =
                      EntryToPosition.try_emplace(E, Idx).first->second;
                  UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                  if (CommonVF) {
                    if (*CommonVF == 0) {
                      CommonVF = E->Scalars.size();
                      continue;
                    }
                    if (*CommonVF != E->Scalars.size())
                      CommonVF.reset();
                  }
                  // Check if the load is part of an interleaved load.
                  if (Pos != Idx && InterleavedLoadsDistance) {
                    if (!DeinterleavedNodes.contains(E) &&
                        any_of(E->Scalars, [&, Slice = Slice](Value *V) {
                          if (isa<Constant>(V))
                            return false;
                          if (getTreeEntry(V))
                            return true;
                          const auto &Nodes = ValueToGatherNodes.at(V);
                          return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                                 !is_contained(Slice, V);
                        })) {
                      InterleavedLoadsDistance.reset();
                      continue;
                    }
                    DeinterleavedNodes.insert(E);
                    if (*InterleavedLoadsDistance == 0) {
                      InterleavedLoadsDistance = Idx - Pos;
                      continue;
                    }
                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
                      InterleavedLoadsDistance.reset();
                    Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
                  }
                }
              }
              DeinterleavedNodes.clear();
              // Check if the large load represents an interleaved load
              // operation.
              if (InterleavedLoadsDistance.value_or(0) > 1 &&
                  CommonVF.value_or(0) != 0) {
                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
                unsigned VF = *CommonVF;
                OrdersType Order;
                SmallVector<Value *> PointerOps;
                // Segmented load detected - vectorize at maximum vector
                // factor.
                if (InterleaveFactor <= Slice.size() &&
                    TTI.isLegalInterleavedAccessType(
                        getWidenedType(Slice.front()->getType(), VF),
                        InterleaveFactor,
                        cast<LoadInst>(Slice.front())->getAlign(),
                        cast<LoadInst>(Slice.front())
                            ->getPointerAddressSpace()) &&
                    canVectorizeLoads(Slice, Slice.front(), Order,
                                      PointerOps) == LoadsState::Vectorize) {
                  UserMaxVF = InterleaveFactor * VF;
                } else {
                  InterleaveFactor = 0;
                }
              }
              // Cannot represent the loads as consecutive vectorizable nodes -
              // just exit.
              unsigned ConsecutiveNodesSize = 0;
              if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                  any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                         [&, Slice = Slice](const auto &P) {
                           const auto *It = find_if(Slice, [&](Value *V) {
                             return std::get<1>(P).contains(V);
                           });
                           if (It == Slice.end())
                             return false;
                           ArrayRef<Value *> VL =
                               VectorizableTree[std::get<0>(P)]->Scalars;
                           ConsecutiveNodesSize += VL.size();
                           unsigned Start = std::distance(Slice.begin(), It);
                           unsigned Sz = Slice.size() - Start;
                           return Sz < VL.size() ||
                                  Slice.slice(std::distance(Slice.begin(), It),
                                              VL.size()) != VL;
                         }))
                continue;
              // Try to build long masked gather loads.
              UserMaxVF = bit_ceil(UserMaxVF);
              if (InterleaveFactor == 0 &&
                  any_of(seq<unsigned>(Slice.size() / UserMaxVF),
                         [&, Slice = Slice](unsigned Idx) {
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           return canVectorizeLoads(
                                      Slice.slice(Idx * UserMaxVF, UserMaxVF),
                                      Slice[Idx * UserMaxVF], Order,
                                      PointerOps) ==
                                  LoadsState::ScatterVectorize;
                         }))
                UserMaxVF = MaxVF;
              if (Slice.size() != ConsecutiveNodesSize)
                MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            }
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                ArrayRef<Value *> SubSlice =
                    Slice.slice(I, std::min(VF, E - I));
                if (getTreeEntry(SubSlice.front()))
                  continue;
                // Check if the subslice is a to-be-vectorized entry that is
                // not equal to the entry.
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                             return !SubSlice.equals(
                                        VectorizableTree[std::get<0>(P)]
                                            ->Scalars) &&
                                    set_is_subset(SubSlice, std::get<1>(P));
                           }))
                  continue;
                unsigned Sz = VectorizableTree.size();
                buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                  // Try non-interleaved vectorization with a smaller vector
                  // factor.
                  if (InterleaveFactor > 0) {
                    VF = 2 * (MaxVF / InterleaveFactor);
                    InterleaveFactor = 0;
                  }
                  continue;
                }
              }
              if (IsVectorized)
                break;
            }
          }
          NonVectorized.append(SortedNonVectorized);
        }
        return NonVectorized;
      };
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
    if (!Ref.empty() && !NonVectorized.empty() &&
        std::accumulate(
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S,
               ArrayRef<std::pair<LoadInst *, int>> LoadsDists) -> unsigned {
              return S + LoadsDists.size();
            }) != NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
      for (LoadInst *LI : NonVectorized) {
        // Reinsert non-vectorized loads into the list of loads with the same
        // base pointers.
        gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
                                        FinalGatheredLoads,
                                        /*AddNew=*/false);
      }
      // Final attempt to vectorize non-vectorized loads.
      (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
    }
  }
  // Try to vectorize postponed load entries, previously marked as gathered.
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
    // Avoid reordering, if possible.
    if (!E.ReorderIndices.empty()) {
      // Build a mask out of the reorder indices and reorder scalars per this
      // mask.
      SmallVector<int> ReorderMask;
      inversePermutation(E.ReorderIndices, ReorderMask);
      reorderScalars(GatheredScalars, ReorderMask);
    }
    buildTree_rec(GatheredScalars, 0, EdgeInfo());
  }
  // If no new entries were created, consider it as no gathered loads entries
  // to be handled.
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
}
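/// Returns the single value of the list that needs to be scheduled, or nullptr
/// if there is more than one such value.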
static Value *needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!NeedsScheduling) {
      NeedsScheduling = V;
      continue;
    }
    return nullptr;
  }
  return NeedsScheduling;
}
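/// Generates a (key, subkey) hash pair used to cluster compatible values:
/// loads hash by pointer distance, compares by predicate kind, calls by callee
/// and GEPs by their pointer operand.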
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI,
    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
    bool AllowAlternate) {
  hash_code Key = hash_value(V->getValueID() + 2);
  hash_code SubKey = hash_value(0);
  // Sort the loads by the distance between the pointers.
  if (auto *LI = dyn_cast<LoadInst>(V)) {
    Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
    if (LI->isSimple())
      SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
    else
      Key = SubKey = hash_value(LI);
  } else if (isVectorLikeInstWithConstOps(V)) {
    // Sort extracts by the vector operands.
    if (isa<ExtractElementInst, UndefValue>(V))
      Key = hash_value(Value::UndefValueVal + 1);
    if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
      if (!isUndefVector(EI->getVectorOperand()).all() &&
          !isa<UndefValue>(EI->getIndexOperand()))
        SubKey = hash_value(EI->getVectorOperand());
    }
  } else if (auto *I = dyn_cast<Instruction>(V)) {
    // Sort other instructions just by the opcodes, except for CmpInst, which
    // is also sorted by the predicate kind.
    if ((isa<BinaryOperator, CastInst>(I)) &&
        isValidForAlternation(I->getOpcode())) {
      if (AllowAlternate)
        Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
      else
        Key = hash_combine(hash_value(I->getOpcode()), Key);
      SubKey = hash_combine(
          hash_value(I->getOpcode()), hash_value(I->getType()),
          hash_value(isa<BinaryOperator>(I)
                         ? I->getType()
                         : cast<CastInst>(I)->getOperand(0)->getType()));
      // For casts, look through the only operand to improve compile time.
      if (isa<CastInst>(I)) {
        std::pair<size_t, size_t> OpVals =
            generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
                              /*AllowAlternate=*/true);
        Key = hash_combine(OpVals.first, Key);
        SubKey = hash_combine(OpVals.first, SubKey);
      }
    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
      CmpInst::Predicate Pred = CI->getPredicate();
      if (CI->isCommutative())
        Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
      CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
      SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
                            hash_value(SwapPred),
                            hash_value(CI->getOperand(0)->getType()));
    } else if (auto *Call = dyn_cast<CallInst>(I)) {
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);
      if (isTriviallyVectorizable(ID))
        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
      else
        SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
        SubKey = hash_value(Gep->getPointerOperand());
      else
        SubKey = hash_value(Gep);
    } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
               !isa<ConstantInt>(I->getOperand(1))) {
      // Do not try to vectorize instructions with potentially high cost.
      SubKey = hash_value(I);
    } else {
      SubKey = hash_value(I->getOpcode());
    }
    Key = hash_combine(hash_value(I->getParent()), Key);
  }
  return std::make_pair(Key, SubKey);
}
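/// Estimates whether vectorizing an alternate-opcode sequence is cheaper than
/// keeping the scalars, by counting unique operands, extra shuffles and
/// undefs.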
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       ArrayRef<Value *> VL) const {
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
  // If this pattern is supported by the target, consider it profitable.
  if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
                           Opcode0, Opcode1, OpcodeMask))
    return true;
  SmallVector<ValueList> Operands;
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    Operands.emplace_back();
    // Prepare the operand vector.
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        Operands.back().push_back(
            PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
        continue;
      }
      Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
    }
  }
  if (Operands.size() == 2) {
    // Try to find the best operand candidates.
    for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
      SmallVector<std::pair<Value *, Value *>> Candidates(3);
      Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
      Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
      Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
      std::optional<int> Res = findBestRootPair(Candidates);
      switch (Res.value_or(0)) {
      case 0:
        break;
      case 1:
        std::swap(Operands[0][I + 1], Operands[1][I + 1]);
        break;
      case 2:
        std::swap(Operands[0][I], Operands[1][I]);
        break;
      default:
        llvm_unreachable("Unexpected index.");
      }
    }
  }
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
  unsigned NonInstCnt = 0;
  // Estimate the number of instructions required for the vectorized node and
  // for the buildvector node.
  unsigned UndefCnt = 0;
  // Count the number of extra shuffles required for the vector nodes.
  unsigned ExtraShuffleInsts = 0;
  // Check that operands do not contain the same values and create either a
  // perfect diamond match or a shuffled match.
  if (Operands.size() == 2) {
    // Do not count the same operands twice.
    if (Operands.front() == Operands.back()) {
      Operands.erase(Operands.begin());
    } else if (!allConstant(Operands.front()) &&
               all_of(Operands.front(), [&](Value *V) {
                 return is_contained(Operands.back(), V);
               })) {
      Operands.erase(Operands.begin());
      ++ExtraShuffleInsts;
    }
  }
  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
  // Vectorize the node if:
  // 1. at least a single operand is constant or splat;
  // 2. operands have many loop invariants (the instructions are not loop
  //    invariant);
  // 3. at least a single unique operand is supposed to be vectorized.
  return none_of(Operands,
                 [&](ArrayRef<Value *> Op) {
                   if (allConstant(Op) ||
                       (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
                        getSameOpcode(Op, *TLI)))
                     return false;
                   DenseMap<Value *, unsigned> Uniques;
                   for (Value *V : Op) {
                     if (isa<Constant, ExtractElementInst>(V) ||
                         getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
                       if (isa<UndefValue>(V))
                         ++UndefCnt;
                       continue;
                     }
                     auto Res = Uniques.try_emplace(V, 0);
                     // Found the first duplicate - need to add a shuffle.
                     if (!Res.second && Res.first->second == 1)
                       ++ExtraShuffleInsts;
                     ++Res.first->getSecond();
                     if (auto *I = dyn_cast<Instruction>(V))
                       UniqueOpcodes.insert(I->getOpcode());
                     else if (Res.second)
                       ++NonInstCnt;
                   }
                   return none_of(Uniques, [&](const auto &P) {
                     return P.first->hasNUsesOrMore(P.second + 1) &&
                            none_of(P.first->users(), [&](User *U) {
                              return getTreeEntry(U) || Uniques.contains(U);
                            });
                   });
                 }) ||
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}
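/// Decides per opcode whether the bundle can be vectorized directly, needs a
/// strided/scatter form, or must be gathered.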
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    const InstructionsState &S, ArrayRef<Value *> VL,
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
    SmallVectorImpl<Value *> &PointerOps) {
  assert(S.getMainOp() &&
         "Expected instructions with same/alternate opcodes only.");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  Instruction *VL0 = S.getMainOp();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Too many operands - gather, will be vectorized later.
    if (VL.size() > MaxPHINumOperands)
      return TreeEntry::NeedToGather;
    // Check for terminator values (e.g. invoke).
    for (Value *V : VL) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      for (Value *Incoming : PHI->incoming_values()) {
        Instruction *Term = dyn_cast<Instruction>(Incoming);
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
    if (!has_single_bit(VL.size()))
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // Check that we have a buildvector and not a shuffle of 2 or more
    // different vectors.
    ValueSet SourceVectors;
    for (Value *V : VL) {
      SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
      assert(getElementIndex(V) != std::nullopt &&
             "Non-constant or undef index?");
    }

    if (count_if(VL, [&SourceVectors](Value *V) {
          return !SourceVectors.contains(V);
        }) >= 2) {
      // Found 2nd source vector - cancel.
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    }
    if (any_of(VL, [&SourceVectors](Value *V) {
          // The last InsertElement can have multiple uses.
          return SourceVectors.contains(V) && !V->hasOneUse();
        })) {
      assert(SLPReVec && "Only supported by REVEC.");
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
    case LoadsState::Vectorize:
      return TreeEntry::Vectorize;
    case LoadsState::ScatterVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return TreeEntry::ScatterVectorize;
    case LoadsState::StridedVectorize:
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return TreeEntry::StridedVectorize;
    case LoadsState::Gather: {
#ifndef NDEBUG
      Type *ScalarTy = VL0->getType();
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
      else if (any_of(VL, [](Value *V) {
                 auto *LI = dyn_cast<LoadInst>(V);
                 return !LI || !LI->isSimple();
               }))
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
      else
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
#endif // NDEBUG
      registerNonVectorizableLoads(VL);
      return TreeEntry::NeedToGather;
    }
    }
    llvm_unreachable("Unexpected state of loads");
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
      if (Ty != SrcTy || !isValidElementType(Ty)) {
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
    Type *ComparedTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      auto *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && I->isBinaryOp() && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
7777 case Instruction::GetElementPtr: {
7779 for (
Value *V : VL) {
7780 auto *
I = dyn_cast<GetElementPtrInst>(V);
7783 if (
I->getNumOperands() != 2) {
7784 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
7785 return TreeEntry::NeedToGather;
7791 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7792 for (
Value *V : VL) {
7793 auto *
GEP = dyn_cast<GEPOperator>(V);
7796 Type *CurTy =
GEP->getSourceElementType();
7798 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
7799 return TreeEntry::NeedToGather;
7805 for (
Value *V : VL) {
7806 auto *
I = dyn_cast<GetElementPtrInst>(V);
7809 auto *
Op =
I->getOperand(1);
7810 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7811 (
Op->getType() != Ty1 &&
7812 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7813 Op->getType()->getScalarSizeInBits() >
7814 DL->getIndexSizeInBits(
7815 V->getType()->getPointerAddressSpace())))) {
7817 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
7818 return TreeEntry::NeedToGather;
7822 return TreeEntry::Vectorize;
7824 case Instruction::Store: {
7826 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
7829 if (
DL->getTypeSizeInBits(ScalarTy) !=
7830 DL->getTypeAllocSizeInBits(ScalarTy)) {
7831 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering stores of non-packed type.\n");
7832 return TreeEntry::NeedToGather;
7836 for (
Value *V : VL) {
7837 auto *
SI = cast<StoreInst>(V);
7838 if (!
SI->isSimple()) {
7840 return TreeEntry::NeedToGather;
7849 if (CurrentOrder.empty()) {
7850 Ptr0 = PointerOps.
front();
7851 PtrN = PointerOps.
back();
7853 Ptr0 = PointerOps[CurrentOrder.front()];
7854 PtrN = PointerOps[CurrentOrder.back()];
7856 std::optional<int> Dist =
7859 if (
static_cast<unsigned>(*Dist) == VL.size() - 1)
7860 return TreeEntry::Vectorize;
7864 return TreeEntry::NeedToGather;
    case Instruction::Call: {
      // Check if the calls are all to the same vectorizable intrinsic or
      // library function.
      if (S.getMainOp()->getType()->isFloatingPointTy() &&
          TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
            auto *I = dyn_cast<Instruction>(V);
            return I && !I->isFast();
          }))
        return TreeEntry::NeedToGather;
      CallInst *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // ... (if the callee has neither an intrinsic nor a vector-library
      // lowering):
      //   return TreeEntry::NeedToGather;
      unsigned NumArgs = CI->arg_size();
      SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
      for (unsigned J = 0; J != NumArgs; ++J)
        if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
          ScalarArgs[J] = CI->getArgOperand(J);
      for (Value *V : VL) {
        CallInst *CI2 = dyn_cast<CallInst>(V);
        if (!CI2 || CI2->getCalledFunction() != CI->getCalledFunction() ||
            getVectorIntrinsicIDForCall(CI2, TLI) != ID /* ... */) {
          LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                            << '\n');
          return TreeEntry::NeedToGather;
        }
        // Some intrinsics have scalar arguments and these must be the same
        // in every lane.
        for (unsigned J = 0; J != NumArgs; ++J) {
          if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
            Value *A1J = CI2->getArgOperand(J);
            if (ScalarArgs[J] != A1J) {
              LLVM_DEBUG(dbgs()
                         << "SLP: mismatched arguments in call:" << *CI
                         << " argument " << ScalarArgs[J] << "!=" << A1J
                         << "\n");
              return TreeEntry::NeedToGather;
            }
          }
        }
        // Operand bundles must match as well.
        if (!CI->hasIdenticalOperandBundleSchema(*CI2) /* ... */) {
          LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
                            << *CI << "!=" << *V << '\n');
          return TreeEntry::NeedToGather;
        }
      }
      return TreeEntry::Vectorize;
    }
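    // A call bundle vectorizes only when every lane calls the same callee
    // with matching scalar arguments and matching operand bundles; any
    // per-lane difference forces a gather.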
    case Instruction::ShuffleVector: {
      if (!S.isAltShuffle()) {
        // REVEC can support non-alternate shuffles.
        if (SLPReVec && getShufflevectorNumGroups(VL))
          return TreeEntry::Vectorize;
        // If this is not an alternate sequence of opcodes like add-sub,
        // then do not vectorize this instruction.
        LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
        return TreeEntry::NeedToGather;
      }
      if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
        LLVM_DEBUG(
            dbgs()
            << "SLP: ShuffleVector not vectorized, operands are buildvector and "
               "the whole alt sequence is not profitable.\n");
        return TreeEntry::NeedToGather;
      }
      return TreeEntry::Vectorize;
    }
    default:
      return TreeEntry::NeedToGather;
    }
}
/// Keeps the operands of a bundle of PHI nodes together, grouped by incoming
/// block, to avoid quadratic lookups via getIncomingValueForBlock.
class PHIHandler {
  DominatorTree &DT;
  PHINode *Main = nullptr;
  SmallVector<Value *> Phis;
  SmallVector<SmallVector<Value *>> Operands;

public:
  PHIHandler() = delete;
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 SmallVector<Value *>(Phis.size(), nullptr)) {}
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    if (Main->getNumIncomingValues() <= FastLimit) {
      // Fast path: match each lane's incoming value per block positionally.
      // ...
        auto *P = dyn_cast<PHINode>(V);
        if (!P) {
          assert(isa<PoisonValue>(V) &&
                 "Expected isa instruction or poison value.");
          // ...
        }
        if (P->getIncomingBlock(I) == InBB)
          // ...
      return;
    }
    // Slow path: group the incoming blocks first, then resolve each PHI's
    // value once per block.
    // ...
      Blocks.try_emplace(InBB).first->second.push_back(I);
    // ...
      if (isa<PoisonValue>(V)) {
        // ...
        continue;
      }
      auto *P = cast<PHINode>(V);
      for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
        // ...
        auto It = Blocks.find(InBB);
        // ...
      }
    // A block that appears more than once must contribute identical operand
    // lists for each of its occurrences.
    for (const auto &P : Blocks) {
      if (P.getSecond().size() <= 1)
        continue;
      unsigned BasicI = P.getSecond().front();
      // ...
          [&](const auto &Data) {
            return !Data.value() ||
                   Data.value() == Operands[BasicI][Data.index()];
          }) &&
          "Expected empty operands list.");
      // ...
    }
  }
  ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
};
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                            const EdgeInfo &UserTreeIdx,
                            unsigned InterleaveFactor) {
  // ...
  SmallVector<int> ReuseShuffleIndices;
  SmallVector<Value *> UniqueValues;
  SmallVector<Value *> NonUniqueValueVL;
  auto TryToFindDuplicates = [&](const InstructionsState &S,
                                 bool DoNotFail = false) {
    // Check that every instruction appears once in this bundle.
    for (Value *V : VL) {
      // ...
      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
      ReuseShuffleIndices.emplace_back(Res.first->second);
      if (Res.second)
        UniqueValues.emplace_back(V);
    }
    size_t NumUniqueScalarValues = UniqueValues.size();
    if (NumUniqueScalarValues == VL.size()
        // ... (and the bundle already forms full vectors)
        ) {
      ReuseShuffleIndices.clear();
    } else {
      // FIXME: Reshuffling scalars is not yet supported for nodes with
      // padding (non-power-of-2 vectors).
      if ((UserTreeIdx.UserTE &&
           UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
          // ...
          ) {
        LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                             "for nodes with padding.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
          (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
             return isa<UndefValue>(V) || !isConstant(V);
           }))) {
        if (DoNotFail && UniquePositions.size() > 1 &&
            NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
            all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
          // Pad the unique scalars with poison up to a full-vector width.
          unsigned PWSz = getFullVectorNumberOfElements(
              *TTI, UniqueValues.front()->getType(), UniqueValues.size());
          if (PWSz == VL.size()) {
            ReuseShuffleIndices.clear();
          } else {
            NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
            NonUniqueValueVL.append(
                PWSz - UniqueValues.size(),
                PoisonValue::get(UniqueValues.front()->getType()));
            // ...
            VL = NonUniqueValueVL;
          }
          return true;
        }
        LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      VL = UniqueValues;
    }
    return true;
  };

  InstructionsState S = getSameOpcode(VL, *TLI);
  // ...
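  // TryToFindDuplicates both deduplicates the bundle (recording a
  // ReuseShuffleIndices mask so duplicate lanes are re-created by a shuffle)
  // and, when DoNotFail is set, pads short bundles with poison lanes up to a
  // full-register vector factor instead of giving up.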
  // Don't go into catchswitch blocks, which can happen with PHIs.
  // Such blocks can only have PHIs and the catchswitch, so there is no place
  // to insert a shuffle if we need one.
  if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // Check if this is a duplicate of another entry.
  if (S) {
    if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
      LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
                        << ".\n");
      if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
        auto It = MultiNodeScalars.find(S.getMainOp());
        if (It != MultiNodeScalars.end()) {
          auto *TEIt = find_if(It->getSecond(),
                               [&](TreeEntry *ME) { return ME->isSame(VL); });
          if (TEIt != It->getSecond().end())
            E = *TEIt;
          else
            E = nullptr;
        } else {
          E = nullptr;
        }
      }
      if (!E) {
        if (!doesNotNeedToBeScheduled(S.getMainOp())) {
          LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
          if (TryToFindDuplicates(S))
            newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                         ReuseShuffleIndices);
          return;
        }
        SmallPtrSet<const TreeEntry *, 4> Nodes;
        Nodes.insert(getTreeEntry(S.getMainOp()));
        for (const TreeEntry *E : MultiNodeScalars.lookup(S.getMainOp()))
          Nodes.insert(E);
        SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
        if (any_of(Nodes, [&](const TreeEntry *E) {
              if (all_of(E->Scalars,
                         [&](Value *V) { return Values.contains(V); }))
                return true;
              SmallPtrSet<Value *, 8> EValues(E->Scalars.begin(),
                                              E->Scalars.end());
              return (all_of(VL,
                             [&](Value *V) { return EValues.contains(V); }));
            })) {
          LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
          if (TryToFindDuplicates(S))
            newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                         ReuseShuffleIndices);
          return;
        }
      } else {
        // Record the reuse of the tree node.
        E->UserTreeIndices.push_back(UserTreeIdx);
        LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
                          << ".\n");
        return;
      }
    }
  }

  // Gather if we hit the recursion limit, unless this looks like a wide
  // load/extend pattern that is still worth a deeper look.
  if (Depth >= RecursionMaxDepth &&
      !(S && !S.isAltShuffle() && VL.size() >= 4 &&
        (match(S.getMainOp(), m_Load(m_Value())) ||
         all_of(VL, [&S](const Value *I) {
           return match(I,
                        m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
                  cast<Instruction>(I)->getOpcode() == S.getOpcode();
         })))) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }

  // Don't handle scalable vectors.
  if (S && S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }

  // Don't handle vectors (unless REVEC is enabled).
  if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }
  // A bundle of two-element alternate-opcode sequences is often not worth a
  // node of its own; try to prove that early.
  auto &&NotProfitableForVectorization = [&S, this,
                                          Depth](ArrayRef<Value *> VL) {
    if (!S || !S.isAltShuffle() || VL.size() > 2)
      return false;
    if (VectorizableTree.size() < MinTreeSize)
      return false;
    if (Depth >= RecursionMaxDepth - 1)
      return true;
    // Check if all operands are extracts, part of a vector node, or can build
    // a valid diamond node.
    SmallVector<unsigned, 8> InstsCount;
    for (Value *V : VL) {
      auto *I = cast<Instruction>(V);
      InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
      }));
    }
    bool IsCommutative =
        isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        (!IsCommutative &&
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
      return true;
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    auto *I1 = cast<Instruction>(VL.front());
    auto *I2 = cast<Instruction>(VL.back());
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
            })) >= S.getMainOp()->getNumOperands() / 2)
      return false;
    if (S.getMainOp()->getNumOperands() > 2)
      return true;
    if (IsCommutative) {
      // Check permuted operands.
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
        Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                               I2->getOperand((Op + 1) % E));
      if (any_of(Candidates,
                 [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
                   return findBestRootPair(Cand,
                                           LookAheadHeuristics::ScoreSplat);
                 }))
        return false;
    }
    return true;
  };
  SmallVector<unsigned> SortedIndices;
  BasicBlock *BB = nullptr;
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameBlock = S && allSameBlock(VL);
  bool AreScatterAllGEPSameBlock =
      (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
       VL.size() > 2 &&
       all_of(VL,
              [&BB](Value *V) {
                auto *I = dyn_cast<GetElementPtrInst>(V);
                if (!I)
                  return doesNotNeedToBeScheduled(V);
                if (!BB)
                  BB = I->getParent();
                return BB == I->getParent() && I->getNumOperands() == 2;
              }) &&
       BB &&
       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
                       *SE, SortedIndices));
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
      (S &&
       isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
           S.getMainOp()) &&
       !all_of(VL, isVectorLikeInstWithConstOps)) ||
      NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }
  // We now know that this is a vector of instructions of the same type from
  // the same block.

  // Don't vectorize ephemeral values.
  if (S && !EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                          << ") is ephemeral.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return;
      }
    }
  }

  // Check if this bundle overlaps with an already vectorized one.
  for (Value *V : VL) {
    if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
        doesNotNeedToBeScheduled(V))
      continue;
    if (getTreeEntry(V)) {
      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                        << ") is already in tree.\n");
      if (TryToFindDuplicates(S))
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                     ReuseShuffleIndices);
      return;
    }
  }

  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        if (TryToFindDuplicates(S))
          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                       ReuseShuffleIndices);
        return;
      }
    }
  }
  // Special processing for sorted pointers for ScatterVectorize node with
  // constant indices only.
  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
    assert(VL.front()->getType()->isPointerTy() &&
           count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
           "Expected pointers only.");
    // Reset S to make it GetElementPtr kind of node.
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");
    // ...
  }

  // ... (bundles in unreachable blocks are gathered):
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
  // ...

  // Deduplicate and, if needed, pad the bundle before scheduling.
  if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
    return;
  // Perform specific checks for each particular instruction kind.
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
  if (State == TreeEntry::NeedToGather) {
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndices);
    return;
  }

  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);

  BlockScheduling &BS = *BSRef;

  std::optional<ScheduleData *> Bundle =
      BS.tryScheduleBundle(UniqueValues, this, S);
#ifdef EXPENSIVE_CHECKS
  // Make sure we didn't break any internal invariants.
  BS.verify();
#endif
  if (!Bundle) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    assert((!BS.getScheduleData(VL0) ||
            !BS.getScheduleData(VL0)->isPartOfBundle()) &&
           "tryScheduleBundle should cancelScheduling on failure");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
      registerNonVectorizableLoads(VL);
    return;
  }
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
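  // Past this point the bundle is schedulable; ShuffleOrOp folds
  // alternate-opcode bundles (e.g. interleaved add/sub) into the
  // Instruction::ShuffleVector case of the switch below.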
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // Postpone PHI node creation.
    SmallVector<unsigned> PHIOps;
    for (unsigned I : seq<unsigned>(Operands.size())) {
      ArrayRef<Value *> Op = Operands[I];
      if (Op.empty())
        continue;
      InstructionsState S = getSameOpcode(Op, *TLI);
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
        buildTree_rec(Op, Depth + 1, {TE, I});
      else
        PHIOps.push_back(I);
    }
    for (unsigned I : PHIOps)
      buildTree_rec(Operands[I], Depth + 1, {TE, I});
  };
  switch (ShuffleOrOp) {
    case Instruction::PHI: {
      auto *PH = cast<PHINode>(VL0);

      TreeEntry *TE =
          newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
                 TE->dump());

      // Keeps the reordered operands to avoid code duplication.
      PHIHandler Handler(*DT, PH, VL);
      Handler.buildOperands();
      for (unsigned I : seq<unsigned>(PH->getNumOperands()))
        TE->setOperand(I, Handler.getOperands(I));
      SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands());
      for (unsigned I : seq<unsigned>(PH->getNumOperands()))
        Operands[I] = Handler.getOperands(I);
      CreateOperandNodes(TE, Operands);
      return;
    }
    case Instruction::ExtractValue:
    case Instruction::ExtractElement: {
      if (CurrentOrder.empty()) {
        LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
      } else {
        LLVM_DEBUG({
          dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                    "with order";
          for (unsigned Idx : CurrentOrder)
            dbgs() << " " << Idx;
          dbgs() << "\n";
        });
      }
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                           "(ExtractValueInst/ExtractElementInst).\n";
                 TE->dump());
      // This is a special case, as it does not gather, but at the same time
      // we are not extending buildTree_rec() towards the operands.
      TE->setOperand(*this);
      break;
    }
    case Instruction::InsertElement: {
      assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

      auto OrdCompare = [](const std::pair<int, int> &P1,
                           const std::pair<int, int> &P2) {
        return P1.first > P2.first;
      };
      PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                    decltype(OrdCompare)>
          Indices(OrdCompare);
      for (int I = 0, E = VL.size(); I < E; ++I) {
        unsigned Idx = *getElementIndex(VL[I]);
        Indices.emplace(Idx, I);
      }
      OrdersType CurrentOrder(VL.size(), VL.size());
      bool IsIdentity = true;
      for (int I = 0, E = VL.size(); I < E; ++I) {
        CurrentOrder[Indices.top().second] = I;
        IsIdentity &= Indices.top().second == I;
        Indices.pop();
      }
      if (IsIdentity)
        CurrentOrder.clear();
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   {}, CurrentOrder);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
                 TE->dump());

      TE->setOperand(*this);
      buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
      break;
    }
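      // The priority queue above orders lanes by their constant insert index;
      // CurrentOrder keeps the permutation needed when bundle order and index
      // order differ, and is cleared when they already match (IsIdentity).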
    case Instruction::Load: {
      // Check that a vectorized load would load the same memory as a scalar
      // load.
      TreeEntry *TE = nullptr;
      fixupOrderingIndices(CurrentOrder);
      switch (State) {
      case TreeEntry::Vectorize:
        TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                          ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
        if (CurrentOrder.empty())
          LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                     TE->dump());
        else
          LLVM_DEBUG(dbgs()
                         << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                     TE->dump());
        break;
      case TreeEntry::StridedVectorize:
        // Vectorizing non-consecutive loads with a strided load.
        TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                   TE->dump());
        break;
      case TreeEntry::ScatterVectorize:
        // Vectorizing non-consecutive loads with `llvm.masked.gather`.
        TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndices);
        LLVM_DEBUG(
            dbgs()
                << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
            TE->dump());
        break;
      case TreeEntry::CombinedVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected loads state.");
      }
      TE->setOperand(*this);
      if (State == TreeEntry::ScatterVectorize)
        buildTree_rec(PointerOps, Depth + 1, {TE, 0});
      break;
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
          std::make_pair(std::numeric_limits<unsigned>::min(),
                         std::numeric_limits<unsigned>::max()));
      if (ShuffleOrOp == Instruction::ZExt ||
          ShuffleOrOp == Instruction::SExt) {
        CastMaxMinBWSizes = std::make_pair(
            std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                               PrevMaxBW),
            std::min<unsigned>(
                DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
                PrevMinBW));
      } else if (ShuffleOrOp == Instruction::Trunc) {
        CastMaxMinBWSizes = std::make_pair(
            std::max<unsigned>(
                DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
                PrevMaxBW),
            std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                               PrevMinBW));
      }
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
                 TE->dump());

      TE->setOperand(*this);
      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
        buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
      if (ShuffleOrOp == Instruction::Trunc) {
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      } else if (ShuffleOrOp == Instruction::SIToFP ||
                 ShuffleOrOp == Instruction::UIToFP) {
        unsigned NumSignBits =
            ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
        if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
          APInt Mask = DB->getDemandedBits(OpI);
          NumSignBits = std::max(NumSignBits, Mask.countl_zero());
        }
        if (NumSignBits * 2 >=
            DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
          ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      }
      break;
    }
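      // A SIToFP/UIToFP source is only recorded as a bit-width-shrinking
      // candidate when at least half of its bits are known sign bits, so the
      // narrowed integer still converts to the same floating-point value.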
    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Check that all of the compares have the same predicate.
      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
                 TE->dump());

      ValueList Left, Right;
      VLOperands Ops(VL, VL0, *this);
      if (cast<CmpInst>(VL0)->isCommutative()) {
        // Commutative predicate - collect and sort the operands so that each
        // side is more likely to have the same opcode.
        assert(P0 == CmpInst::getSwappedPredicate(P0) &&
               "Commutative Predicate mismatch");
        Ops.reorder();
        Left = Ops.getVL(0);
        Right = Ops.getVL(1);
      } else {
        // Collect operands - commute if a lane uses the swapped predicate.
        for (Value *V : VL) {
          if (isa<PoisonValue>(V)) {
            Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
            Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
            continue;
          }
          auto *Cmp = cast<CmpInst>(V);
          Value *LHS = Cmp->getOperand(0);
          Value *RHS = Cmp->getOperand(1);
          if (Cmp->getPredicate() != P0)
            std::swap(LHS, RHS);
          Left.push_back(LHS);
          Right.push_back(RHS);
        }
      }
      TE->setOperand(0, Left);
      TE->setOperand(1, Right);
      buildTree_rec(Left, Depth + 1, {TE, 0});
      buildTree_rec(Right, Depth + 1, {TE, 1});
      if (ShuffleOrOp == Instruction::ICmp) {
        unsigned NumSignBits0 =
            ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
        if (NumSignBits0 * 2 >=
            DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
          ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
        unsigned NumSignBits1 =
            ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
        if (NumSignBits1 * 2 >=
            DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
          ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
      }
      break;
    }
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Freeze: {
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry "
                    "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
          TE->dump());

      TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
        buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
      break;
    }
    case Instruction::GetElementPtr: {
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
                 TE->dump());
      SmallVector<ValueList, 2> Operands(2);
      // Prepare the operand vector for pointer operands.
      for (Value *V : VL) {
        auto *GEP = dyn_cast<GetElementPtrInst>(V);
        if (!GEP) {
          Operands.front().push_back(V);
          continue;
        }
        Operands.front().push_back(GEP->getPointerOperand());
      }
      // Need to cast all indices to the same type before vectorization to
      // avoid crashes, and to be able to find matches between gather nodes.
      const unsigned IndexIdx = 1;
      Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
      Type *Ty = all_of(VL,
                        [VL0Ty, IndexIdx](Value *V) {
                          auto *GEP = dyn_cast<GetElementPtrInst>(V);
                          if (!GEP)
                            return true;
                          return VL0Ty == GEP->getOperand(IndexIdx)->getType();
                        })
                     ? VL0Ty
                     : DL->getIndexType(cast<GetElementPtrInst>(VL0)
                                            ->getPointerOperandType()
                                            ->getScalarType());
      // Prepare the index operand vector.
      for (Value *V : VL) {
        auto *I = dyn_cast<GetElementPtrInst>(V);
        if (!I) {
          Operands.back().push_back(
              ConstantInt::get(Ty, 0, /*isSigned=*/false));
          continue;
        }
        auto *Op = I->getOperand(IndexIdx);
        auto *CI = dyn_cast<ConstantInt>(Op);
        if (!CI)
          Operands.back().push_back(Op);
        else
          Operands.back().push_back(ConstantFoldIntegerCast(
              CI, Ty, CI->getValue().isSignBitSet(), *DL));
      }
      for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I) {
        TE->setOperand(I, Operands[I]);
        buildTree_rec(Operands[I], Depth + 1, {TE, I});
      }
      break;
    }
    case Instruction::Store: {
      bool Consecutive = CurrentOrder.empty();
      if (!Consecutive)
        fixupOrderingIndices(CurrentOrder);
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices, CurrentOrder);
      if (Consecutive)
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                   TE->dump());
      else
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
                   TE->dump());
      TE->setOperand(*this);
      buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
      break;
    }
    case Instruction::Call: {
      // Check if the calls are all to the same vectorizable intrinsic or
      // library function.
      CallInst *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
                 TE->dump());
      TE->setOperand(*this, isCommutative(VL0));
      for (unsigned I : seq<unsigned>(CI->arg_size())) {
        // For scalar operands no need to create an entry since there is no
        // need to vectorize them.
        if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
          continue;
        buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
      }
      break;
    }
    case Instruction::ShuffleVector: {
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices);
      if (S.isAltShuffle()) {
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                   TE->dump());
      } else {
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
                   TE->dump());
      }

      // Reorder operands if reordering would enable vectorization.
      auto *CI = dyn_cast<CmpInst>(VL0);
      if (CI && any_of(VL, [](Value *V) {
            return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
          })) {
        auto *MainCI = cast<CmpInst>(S.getMainOp());
        auto *AltCI = cast<CmpInst>(S.getAltOp());
        assert(MainCI->getPredicate() != AltCI->getPredicate() &&
               "Expected different main/alternate predicates.");
        ValueList Left, Right;
        // Collect operands - commute if a lane uses the swapped predicate or
        // the alternate operation.
        for (Value *V : VL) {
          if (isa<PoisonValue>(V)) {
            Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
            Right.push_back(
                PoisonValue::get(MainCI->getOperand(1)->getType()));
            continue;
          }
          auto *Cmp = cast<CmpInst>(V);
          Value *LHS = Cmp->getOperand(0);
          Value *RHS = Cmp->getOperand(1);
          // ... (swap LHS/RHS when the lane's predicate is the swapped form)
          Left.push_back(LHS);
          Right.push_back(RHS);
        }
        TE->setOperand(0, Left);
        TE->setOperand(1, Right);
        buildTree_rec(Left, Depth + 1, {TE, 0});
        buildTree_rec(Right, Depth + 1, {TE, 1});
        break;
      }

      TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
        buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
      break;
    }
unsigned BoUpSLP::canMapToVector(Type *T) const {
  unsigned N = 1;
  Type *EltTy = T;

  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that the struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }

  if (!isValidElementType(EltTy))
    return 0;
  uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  return N;
}
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  assert(
      all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
      "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *Vec = E0->getOperand(0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of
  // elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    NElts = canMapToVector(Vec->getType());
    if (!NElts)
      return false;
    // Check if the load can be rewritten as a load of a vector.
    LoadInst *LI = dyn_cast<LoadInst>(Vec);
    if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
      return false;
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }

  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      continue;
    if (Inst->getOperand(0) != Vec)
      return false;
    if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
      if (isa<UndefValue>(EE->getIndexOperand()))
        continue;
    std::optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      return false;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign to all items the initial value E so we can detect when an extract
  // index is reused within the bundle.
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
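// On success, CurrentOrder is either empty (the extracts are already in
// identity order, so the source vector can be reused as-is) or holds the
// permutation mapping extract indices back to bundle positions.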
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return ScalarToTreeEntry.contains(U) ||
                  isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   ArrayRef<Type *> ArgTys) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // Calculate the cost of the scalar and vector calls.
  FastMathFlags FMF;
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF,
                                    dyn_cast<IntrinsicInst>(CI));
  auto IntrinsicCost =
      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);

  auto LibCost = IntrinsicCost;
  // ... (when a vectorized library routine is available, cost the call to it
  // instead and let the caller choose)
  return {IntrinsicCost, LibCost};
}
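// The returned pair lets the caller pick the cheaper lowering per bundle:
// first the cost of a vector intrinsic, second the cost of a call to a
// vectorized library routine when one exists.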
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp,
    SmallVectorImpl<int> &Mask, SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    if (isa<PoisonValue>(Scalars[Idx]))
      continue;
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}

static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    CmpInst::Predicate MainP = MainCI->getPredicate();
    [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    CmpInst::Predicate P = CI->getPredicate();
    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swap.");
    // The alternate lane is the one matching neither the main predicate nor
    // its swapped form.
    return MainP != P && MainP != SwappedP;
  }
  return I->getOpcode() == AltOp->getOpcode();
}
  // (Function head elided: computes a TTI::OperandValueInfo for the bundle's
  // operands.)
  const auto *Op0 = Ops.front();

  const bool IsConstant = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    return isConstant(V) && !isa<UndefValue>(V);
  });
  const bool IsUniform = all_of(Ops, [=](Value *V) {
    // TODO: We should allow undef elements here.
    return V == Op0;
  });
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();
    return false;
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
    return false;
  });

  TTI::OperandValueKind VK = TTI::OK_AnyValue;
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;
  // ...
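  // The resulting operand kind/properties feed the TTI arithmetic-cost
  // queries: e.g. a division by a uniform power-of-two constant is typically
  // costed (and later lowered) as a shift rather than a real division.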
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value. With REVEC disabled there is no
  /// difference between VF and the number of elements of V; with REVEC
  /// enabled, VF is VNumElements / ScalarTyNumElements.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }
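  // Example: with REVEC and ScalarTy == <2 x float>, a <8 x float> vectorized
  // value has VF == 4; with a plain scalar type, VF is just the element count.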
9171 int Limit =
Mask.size();
9183 if (Limit % VF == 0 &&
all_of(seq<int>(0, Limit / VF), [=](
int Idx) {
9199 unsigned VF =
Mask.size();
9201 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
9204 int MaskedIdx =
Mask[ExtMask[
I] % VF];
9245 bool SinglePermute) {
9249 while (
auto *SV = dyn_cast<ShuffleVectorInst>(
Op)) {
9251 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
9257 if (isIdentityMask(Mask, SVTy,
false)) {
9258 if (!IdentityOp || !SinglePermute ||
9259 (isIdentityMask(Mask, SVTy,
true) &&
9261 IdentityMask.
size()))) {
9266 IdentityMask.
assign(Mask);
9286 if (SV->isZeroEltSplat()) {
9288 IdentityMask.
assign(Mask);
9290 int LocalVF =
Mask.size();
9292 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
9293 LocalVF = SVOpTy->getNumElements();
9297 static_cast<unsigned>(
I) >= SV->getShuffleMask().size())
9299 ExtMask[
Idx] = SV->getMaskValue(
I);
9309 if (!IsOp1Undef && !IsOp2Undef) {
9311 for (
int &
I : Mask) {
9314 if (SV->getMaskValue(
I % SV->getShuffleMask().size()) ==
9321 combineMasks(LocalVF, ShuffleMask, Mask);
9322 Mask.swap(ShuffleMask);
9324 Op = SV->getOperand(0);
9326 Op = SV->getOperand(1);
9328 if (
auto *OpTy = dyn_cast<FixedVectorType>(
Op->getType());
9329 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
9334 "Expected masks of same sizes.");
9339 Mask.swap(IdentityMask);
9340 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
9341 return SinglePermute &&
9342 (isIdentityMask(Mask, cast<FixedVectorType>(
V->getType()),
9344 (Shuffle &&
Mask.size() == Shuffle->getShuffleMask().size() &&
9345 Shuffle->isZeroEltSplat() &&
9358 template <
typename T,
typename ShuffleBuilderTy>
9360 ShuffleBuilderTy &Builder) {
9361 assert(V1 &&
"Expected at least one vector value.");
9363 Builder.resizeToMatch(V1, V2);
9364 int VF =
Mask.size();
9365 if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
9366 VF = FTy->getNumElements();
9367 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
9374 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
9377 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
9379 CombinedMask1[
I] =
Mask[
I];
9381 CombinedMask2[
I] =
Mask[
I] - VF;
9388 (void)peekThroughShuffles(Op1, CombinedMask1,
false);
9389 (void)peekThroughShuffles(Op2, CombinedMask2,
false);
9392 if (
auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
9393 if (
auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
9398 ExtMask1[
Idx] = SV1->getMaskValue(
I);
9401 cast<FixedVectorType>(SV1->getOperand(1)->getType())
9403 ExtMask1, UseMask::SecondArg);
9408 ExtMask2[
Idx] = SV2->getMaskValue(
I);
9411 cast<FixedVectorType>(SV2->getOperand(1)->getType())
9413 ExtMask2, UseMask::SecondArg);
9414 if (SV1->getOperand(0)->getType() ==
9415 SV2->getOperand(0)->getType() &&
9416 SV1->getOperand(0)->getType() != SV1->getType() &&
9419 Op1 = SV1->getOperand(0);
9420 Op2 = SV2->getOperand(0);
9422 int LocalVF = ShuffleMask1.size();
9423 if (
auto *FTy = dyn_cast<FixedVectorType>(Op1->
getType()))
9424 LocalVF = FTy->getNumElements();
9425 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
9426 CombinedMask1.swap(ShuffleMask1);
9428 LocalVF = ShuffleMask2.size();
9429 if (
auto *FTy = dyn_cast<FixedVectorType>(Op2->
getType()))
9430 LocalVF = FTy->getNumElements();
9431 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
9432 CombinedMask2.swap(ShuffleMask2);
9435 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
9436 Builder.resizeToMatch(Op1, Op2);
9437 VF = std::max(cast<VectorType>(Op1->
getType())
9439 .getKnownMinValue(),
9440 cast<VectorType>(Op2->
getType())
9442 .getKnownMinValue());
9443 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
9446 "Expected undefined mask element");
9447 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
9453 isa<ShuffleVectorInst>(Op1) &&
9454 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
9456 return Builder.createIdentity(Op1);
9457 return Builder.createShuffleVector(
9461 if (isa<PoisonValue>(V1))
9462 return Builder.createPoison(
9463 cast<VectorType>(V1->
getType())->getElementType(),
Mask.size());
9465 bool IsIdentity = peekThroughShuffles(V1, NewMask,
true);
9466 assert(V1 &&
"Expected non-null value after looking through shuffles.");
9469 return Builder.createShuffleVector(V1, NewMask);
9470 return Builder.createIdentity(V1);
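  // createShuffle first looks through chains of shufflevectors on each
  // operand (peekThroughShuffles) and only then materializes a new shuffle;
  // if the combined mask degenerates to the identity, the builder emits (and
  // costs) nothing.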
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Case 1: the pointers are the addresses of loads/stores vectorized as
    // one wide memory access; credit the GEPs that disappear.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);
    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity, assume a non-GEP pointer (or a multi-use GEP) stays
      // in the vectorized code.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then there are no savings.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // Case 2: the pointers themselves form the vectorized node (e.g. pointer
    // operands of a masked gather).
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();
    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }
  return std::make_pair(ScalarCost, VecCost);
}
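// Returned as {scalar cost, vector cost}; the caller credits the difference
// to the tree, so address computation that disappears after vectorization
// shows up as a saving.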
9557void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
9558 assert(
TE.isGather() &&
TE.ReorderIndices.empty() &&
9559 "Expected gather node without reordering.");
9565 if (
TE.Scalars.size() == 2 || (
TE.getOpcode() && !
TE.isAltShuffle()) ||
9569 if (
any_of(seq<unsigned>(
TE.Idx), [&](
unsigned Idx) {
9570 return VectorizableTree[Idx]->isSame(TE.Scalars);
9574 auto GenerateLoadsSubkey = [&](
size_t Key,
LoadInst *LI) {
9579 auto LIt = LoadsMap.
find(std::make_pair(Key,
Ptr));
9580 if (LIt != LoadsMap.
end()) {
9581 for (
LoadInst *RLI : LIt->second) {
9587 for (
LoadInst *RLI : LIt->second) {
9594 if (LIt->second.size() > 2) {
9596 hash_value(LIt->second.back()->getPointerOperand());
9602 LoadsMap.
try_emplace(std::make_pair(Key,
Ptr)).first->second.push_back(LI);
9607 bool IsOrdered =
true;
9608 unsigned NumInstructions = 0;
9613 if (
auto *Inst = dyn_cast<Instruction>(V);
9614 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
9620 auto &Container = SortedValues[
Key];
9621 if (IsOrdered && !KeyToIndex.
contains(V) &&
9622 !(isa<Constant, ExtractElementInst>(V) ||
9624 ((Container.contains(
Idx) &&
9625 KeyToIndex.
at(Container[
Idx].back()).back() !=
I - 1) ||
9626 (!Container.empty() && !Container.contains(
Idx) &&
9627 KeyToIndex.
at(Container.back().second.back()).back() !=
I - 1)))
9629 auto &KTI = KeyToIndex[
V];
9631 Container[
Idx].push_back(V);
9636 if (!IsOrdered && NumInstructions > 1) {
9638 TE.ReorderIndices.resize(
TE.Scalars.size(),
TE.Scalars.size());
9639 for (
const auto &
D : SortedValues) {
9640 for (
const auto &
P :
D.second) {
9642 for (
Value *V :
P.second) {
9645 TE.ReorderIndices[Cnt +
K] =
Idx;
9646 TE.Scalars[Cnt +
K] =
V;
9648 Sz += Indices.
size();
9649 Cnt += Indices.
size();
9651 if (Sz > 1 && isa<Instruction>(
P.second.front())) {
9653 *
TTI,
TE.Scalars.front()->getType(), Sz);
9655 for (
unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
9657 }
else if (!
P.second.empty() &&
isConstant(
P.second.front())) {
9658 for (
unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
9665 if (!
TE.ReuseShuffleIndices.empty() ||
TE.ReorderIndices.empty())
9670 auto *ScalarTy =
TE.Scalars.front()->getType();
9672 for (
auto [
Idx, Sz] : SubVectors) {
9676 if (
auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9681 for (
unsigned I : seq<unsigned>(
TE.Scalars.size()))
9682 if (DemandedElts[
I])
9685 CostKind,
I * ScalarTyNumElements, FTy);
9690 int Sz =
TE.Scalars.size();
9692 TE.ReorderIndices.end());
9693 for (
unsigned I : seq<unsigned>(Sz)) {
9695 if (isa<PoisonValue>(V)) {
9698 ReorderMask[
I] =
I +
TE.ReorderIndices.size();
9702 any_of(ReorderMask, [&](
int I) {
return I >= Sz; })
9705 VecTy, ReorderMask);
9708 for (
unsigned I : seq<unsigned>(Sz)) {
9712 if (!isa<PoisonValue>(V))
9715 ReorderMask[
I] =
I + Sz;
9719 VecTy, DemandedElts,
true,
false,
CostKind);
9722 if (
Cost >= BVCost) {
9725 TE.ReorderIndices.clear();
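  // The new order is kept only when the shuffle-based estimate beats the
  // plain buildvector estimate (BVCost); otherwise ReorderIndices is dropped
  // again and the gather node stays as-is.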
  BaseGraphSize = VectorizableTree.size();
  // Turn the graph-transforming mode on, and back off again when done.
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
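  // RAII guard: graph-transform mode stays on for the rest of this function
  // and is switched back off automatically when TransformContext goes out of
  // scope.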
9751 const InstructionsState &S) {
9753 for (
unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
9755 I2->getOperand(
Op));
9757 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
9759 [](
const std::pair<Value *, Value *> &
P) {
9760 return isa<Constant>(
P.first) ||
9761 isa<Constant>(
P.second) ||
P.first ==
P.second;
9768 for (
unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9769 TreeEntry &E = *VectorizableTree[
Idx];
9771 reorderGatherNode(E);
9775 for (
unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9776 TreeEntry &E = *VectorizableTree[
Idx];
9783 if (VL.
size() <= 2 || LoadEntriesToVectorize.contains(
Idx) ||
9784 !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
9790 unsigned StartIdx = 0;
9795 *
TTI, VL.
front()->getType(), VF - 1)) {
9796 if (StartIdx + VF >
End)
9799 for (
unsigned Cnt = StartIdx; Cnt + VF <=
End; Cnt += VF) {
9803 if (
const TreeEntry *SE = getTreeEntry(Slice.
front());
9804 SE || getTreeEntry(Slice.
back())) {
9807 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9815 bool IsSplat =
isSplat(Slice);
9816 if (Slices.
empty() || !IsSplat ||
9818 Slice.
front()->getType(), VF)),
9821 Slice.
front()->getType(), 2 * VF)),
9824 static_cast<long>(isa<UndefValue>(Slice.
front()) ? VF - 1
9830 (S.getOpcode() == Instruction::Load &&
9837 if ((!UserIgnoreList || E.Idx != 0) &&
9841 if (isa<PoisonValue>(V))
9843 return areAllUsersVectorized(cast<Instruction>(V),
9847 if (S.getOpcode() == Instruction::Load) {
9859 if (UserIgnoreList && E.Idx == 0)
9864 }
else if (S.getOpcode() == Instruction::ExtractElement ||
9867 !CheckOperandsProfitability(
9870 IsaPred<Instruction>)),
9881 auto AddCombinedNode = [&](
unsigned Idx,
unsigned Cnt,
unsigned Sz) {
9882 E.CombinedEntriesWithIndices.emplace_back(
Idx, Cnt);
9883 if (StartIdx == Cnt)
9884 StartIdx = Cnt + Sz;
9885 if (
End == Cnt + Sz)
9888 for (
auto [Cnt, Sz] : Slices) {
9891 if (TreeEntry *SE = getTreeEntry(Slice.
front());
9892 SE || getTreeEntry(Slice.
back())) {
9895 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9897 SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
9898 AddCombinedNode(SE->Idx, Cnt, Sz);
9901 unsigned PrevSize = VectorizableTree.size();
9902 [[maybe_unused]]
unsigned PrevEntriesSize =
9903 LoadEntriesToVectorize.size();
9904 buildTree_rec(Slice, 0,
EdgeInfo(&E, UINT_MAX));
9905 if (PrevSize + 1 == VectorizableTree.size() &&
9906 VectorizableTree[PrevSize]->isGather() &&
9907 VectorizableTree[PrevSize]->getOpcode() !=
9908 Instruction::ExtractElement &&
9910 if (UserIgnoreList && E.Idx == 0 && VF == 2)
9912 VectorizableTree.pop_back();
9913 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
9914 "LoadEntriesToVectorize expected to remain the same");
9917 AddCombinedNode(PrevSize, Cnt, Sz);
9921 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
9924 E.ReorderIndices.clear();
9927 switch (E.getOpcode()) {
9928 case Instruction::Load: {
9931 if (E.State != TreeEntry::Vectorize)
9933 Type *ScalarTy = E.getMainOp()->getType();
9935 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
9938 if (!E.ReorderIndices.empty() &&
isReverseOrder(E.ReorderIndices) &&
9942 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
9949 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
9950 false, CommonAlignment,
CostKind, BaseLI);
9951 if (StridedCost < OriginalVecCost)
9954 E.State = TreeEntry::StridedVectorize;
9958 case Instruction::Store: {
9960 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
9962 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
9965 if (!E.ReorderIndices.empty() &&
isReverseOrder(E.ReorderIndices) &&
9969 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
9976 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9977 false, CommonAlignment,
CostKind, BaseSI);
9978 if (StridedCost < OriginalVecCost)
9981 E.State = TreeEntry::StridedVectorize;
9982 }
else if (!E.ReorderIndices.empty()) {
9985 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
9986 assert(Mask.size() > 1 &&
"Expected mask greater than 1 element.");
9987 if (Mask.size() < 4)
9989 for (
unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
9993 VecTy, Factor, BaseSI->getAlign(),
10001 unsigned InterleaveFactor = IsInterleaveMask(Mask);
10002 if (InterleaveFactor != 0)
10003 E.setInterleave(InterleaveFactor);
10007 case Instruction::Select: {
10008 if (E.State != TreeEntry::Vectorize)
10014 E.CombinedOp = TreeEntry::MinMax;
10015 TreeEntry *CondEntry =
const_cast<TreeEntry *
>(getOperandEntry(&E, 0));
10016 if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
10017 CondEntry->State == TreeEntry::Vectorize) {
10019 CondEntry->State = TreeEntry::CombinedVectorize;
10028 if (LoadEntriesToVectorize.empty()) {
10030 if (VectorizableTree.size() <= 1 &&
10031 VectorizableTree.front()->getOpcode() == Instruction::Load)
10034 constexpr unsigned SmallTree = 3;
10035 constexpr unsigned SmallVF = 2;
10036 if ((VectorizableTree.size() <= SmallTree &&
10037 VectorizableTree.front()->Scalars.size() == SmallVF) ||
10038 (VectorizableTree.size() <= 2 && UserIgnoreList))
10041 if (VectorizableTree.front()->isNonPowOf2Vec() &&
10045 [](
const std::unique_ptr<TreeEntry> &TE) {
10046 return TE->isGather() &&
10047 TE->getOpcode() == Instruction::Load &&
10059 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
10060 TreeEntry &E = *TE;
10061 if (E.isGather() &&
10062 (E.getOpcode() == Instruction::Load ||
10063 (!E.getOpcode() &&
any_of(E.Scalars,
10065 return isa<LoadInst>(V) &&
10066 !isVectorized(V) &&
10067 !isDeleted(cast<Instruction>(V));
10070 for (
Value *V : E.Scalars) {
10071 auto *LI = dyn_cast<LoadInst>(V);
10077 *
this, V, *DL, *SE, *
TTI,
10078 GatheredLoads[std::make_tuple(
10086 if (!GatheredLoads.
empty())
10087 tryToVectorizeGatheredLoads(GatheredLoads);
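  // Loads that only ever appeared in gather nodes are regrouped by base
  // pointer and type, then given one more chance to form wide, strided, or
  // masked-gather loads via tryToVectorizeGatheredLoads.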
10097 bool IsFinalized =
false;
10110 bool SameNodesEstimated =
true;
10119 if (
auto *VTy = dyn_cast<VectorType>(Ty))
10135 const auto *It =
find_if_not(VL, IsaPred<UndefValue>);
10136 assert(It != VL.
end() &&
"Expected at least one non-undef value.");
10139 count(VL, *It) > 1 &&
10141 if (!NeedShuffle) {
10142 if (isa<FixedVectorType>(ScalarTy)) {
10147 cast<FixedVectorType>(ScalarTy));
10150 CostKind, std::distance(VL.
begin(), It),
10156 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
10163 VecTy, ShuffleMask, CostKind,
10167 return GatherCost +
10168 (
all_of(Gathers, IsaPred<UndefValue>)
10170 : R.getGatherCost(Gathers, !Root && VL.
equals(Gathers),
10178 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10179 unsigned NumParts) {
10180 assert(VL.
size() > NumParts &&
"Unexpected scalarized shuffle.");
10182 std::accumulate(VL.
begin(), VL.
end(), 0, [](
unsigned Sz,
Value *V) {
10183 auto *EE = dyn_cast<ExtractElementInst>(V);
10186 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
10189 return std::max(Sz, VecTy->getNumElements());
10195 -> std::optional<TTI::ShuffleKind> {
10196 if (NumElts <= EltsPerVector)
10197 return std::nullopt;
10199 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
10201 if (I == PoisonMaskElem)
10203 return std::min(S, I);
10206 int OffsetReg1 = OffsetReg0;
10210 int FirstRegId = -1;
10211 Indices.assign(1, OffsetReg0);
10215 int Idx =
I - OffsetReg0;
10217 (
Idx / NumElts) * NumParts + (
Idx % NumElts) / EltsPerVector;
10218 if (FirstRegId < 0)
10219 FirstRegId = RegId;
10220 RegIndices.
insert(RegId);
10221 if (RegIndices.
size() > 2)
10222 return std::nullopt;
10223 if (RegIndices.
size() == 2) {
10225 if (Indices.
size() == 1) {
10228 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
10229 [&](
int S,
int I) {
10230 if (I == PoisonMaskElem)
10232 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
10233 ((I - OffsetReg0) % NumElts) / EltsPerVector;
10234 if (RegId == FirstRegId)
10236 return std::min(S, I);
10239 Indices.push_back(OffsetReg1 % NumElts);
10241 Idx =
I - OffsetReg1;
10243 I = (
Idx % NumElts) % EltsPerVector +
10244 (RegId == FirstRegId ? 0 : EltsPerVector);
10246 return ShuffleKind;
10253 for (
unsigned Part : seq<unsigned>(NumParts)) {
10254 if (!ShuffleKinds[Part])
10257 Part * EltsPerVector,
getNumElems(Mask.size(), EltsPerVector, Part));
10261 std::optional<TTI::ShuffleKind> RegShuffleKind =
10262 CheckPerRegistersShuffle(SubMask, Indices);
10263 if (!RegShuffleKind) {
10266 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.
size())))
10279 *R.TTI, VL.
front()->getType(),
alignTo(NumElts, EltsPerVector));
10280 for (
unsigned Idx : Indices) {
10281 assert((
Idx + EltsPerVector) <= BaseVF &&
10282 "SK_ExtractSubvector index out of range");
10293 if (OriginalCost <
Cost)
10294 Cost = OriginalCost;
10302 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10309 void estimateNodesPermuteCost(
const TreeEntry &E1,
const TreeEntry *E2,
10311 unsigned SliceSize) {
10312 if (SameNodesEstimated) {
10318 if ((InVectors.
size() == 2 &&
10319 cast<const TreeEntry *>(InVectors.
front()) == &E1 &&
10320 cast<const TreeEntry *>(InVectors.
back()) == E2) ||
10321 (!E2 && cast<const TreeEntry *>(InVectors.
front()) == &E1)) {
10322 unsigned Limit =
getNumElems(Mask.size(), SliceSize, Part);
10325 "Expected all poisoned elements.");
10327 copy(SubMask, std::next(CommonMask.
begin(), SliceSize * Part));
10332 Cost += createShuffle(InVectors.
front(),
10333 InVectors.
size() == 1 ?
nullptr : InVectors.
back(),
10335 transformMaskAfterShuffle(CommonMask, CommonMask);
10336 }
else if (InVectors.
size() == 2) {
10337 Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
10338 transformMaskAfterShuffle(CommonMask, CommonMask);
10340 SameNodesEstimated =
false;
10341 if (!E2 && InVectors.
size() == 1) {
10342 unsigned VF = E1.getVectorFactor();
10345 cast<FixedVectorType>(V1->
getType())->getNumElements());
10347 const auto *E = cast<const TreeEntry *>(InVectors.
front());
10348 VF = std::max(VF, E->getVectorFactor());
10350 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10352 CommonMask[
Idx] = Mask[
Idx] + VF;
10353 Cost += createShuffle(InVectors.
front(), &E1, CommonMask);
10354 transformMaskAfterShuffle(CommonMask, CommonMask);
10356 auto P = InVectors.
front();
10357 Cost += createShuffle(&E1, E2, Mask);
10358 unsigned VF = Mask.size();
10363 const auto *E = cast<const TreeEntry *>(
P);
10364 VF = std::max(VF, E->getVectorFactor());
10366 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10368 CommonMask[
Idx] =
Idx + (InVectors.
empty() ? 0 : VF);
10369 Cost += createShuffle(
P, InVectors.
front(), CommonMask);
10370 transformMaskAfterShuffle(CommonMask, CommonMask);
10374 class ShuffleCostBuilder {
10377 static bool isEmptyOrIdentity(
ArrayRef<int> Mask,
unsigned VF) {
10379 return Mask.empty() ||
10380 (VF == Mask.size() &&
10388 ~ShuffleCostBuilder() =
default;
10393 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
10394 if (isEmptyOrIdentity(Mask, VF))
10397 cast<VectorType>(V1->
getType()), Mask);
10402 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
10403 if (isEmptyOrIdentity(Mask, VF))
10406 cast<VectorType>(V1->
getType()), Mask);
10412 void resizeToMatch(
Value *&,
Value *&)
const {}
10422 ShuffleCostBuilder Builder(
TTI);
10425 unsigned CommonVF = Mask.size();
10427 auto GetNodeMinBWAffectedCost = [&](
const TreeEntry &E,
10431 Type *EScalarTy = E.Scalars.front()->getType();
10432 bool IsSigned =
true;
10433 if (
auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
10435 IsSigned = It->second.second;
10437 if (EScalarTy != ScalarTy) {
10438 unsigned CastOpcode = Instruction::Trunc;
10439 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10440 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10442 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10450 if (isa<Constant>(V))
10452 auto *VecTy = cast<VectorType>(V->getType());
10454 if (EScalarTy != ScalarTy) {
10456 unsigned CastOpcode = Instruction::Trunc;
10457 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10458 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10460 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10467 if (!V1 && !V2 && !P2.
isNull()) {
10469 const TreeEntry *E = cast<const TreeEntry *>(P1);
10470 unsigned VF = E->getVectorFactor();
10471 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10472 CommonVF = std::max(VF, E2->getVectorFactor());
10475 return Idx < 2 * static_cast<int>(CommonVF);
10477 "All elements in mask must be less than 2 * CommonVF.");
10478 if (E->Scalars.size() == E2->Scalars.size()) {
10482 for (
int &
Idx : CommonMask) {
10485 if (
Idx <
static_cast<int>(CommonVF) && !EMask.
empty())
10487 else if (
Idx >=
static_cast<int>(CommonVF))
10488 Idx = (E2Mask.
empty() ?
Idx - CommonVF : E2Mask[
Idx - CommonVF]) +
10492 CommonVF = E->Scalars.size();
10493 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
10494 GetNodeMinBWAffectedCost(*E2, CommonVF);
10496 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
10497 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
10500 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
10501 }
else if (!V1 && P2.
isNull()) {
10503 const TreeEntry *E = cast<const TreeEntry *>(P1);
10504 unsigned VF = E->getVectorFactor();
10508 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
10509 "All elements in mask must be less than CommonVF.");
10510 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
10512 assert(!EMask.
empty() &&
"Expected non-empty common mask.");
10513 for (
int &
Idx : CommonMask) {
10517 CommonVF = E->Scalars.size();
10518 }
else if (
unsigned Factor = E->getInterleaveFactor();
10519 Factor > 0 && E->Scalars.size() != Mask.size() &&
10523 std::iota(CommonMask.
begin(), CommonMask.
end(), 0);
10525 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
10528 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
10529 CommonVF == CommonMask.
size() &&
10531 [](
const auto &&
P) {
10533 static_cast<unsigned>(
P.value()) !=
P.index();
10541 }
else if (V1 && P2.
isNull()) {
10543 ExtraCost += GetValueMinBWAffectedCost(V1);
10544 CommonVF = getVF(V1);
10547 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
10548 "All elements in mask must be less than CommonVF.");
10549 }
else if (V1 && !V2) {
10551 unsigned VF = getVF(V1);
10552 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10553 CommonVF = std::max(VF, E2->getVectorFactor());
10556 return Idx < 2 * static_cast<int>(CommonVF);
10558 "All elements in mask must be less than 2 * CommonVF.");
10559 if (E2->Scalars.size() == VF && VF != CommonVF) {
10561 assert(!E2Mask.
empty() &&
"Expected non-empty common mask.");
10562 for (
int &
Idx : CommonMask) {
10565 if (
Idx >=
static_cast<int>(CommonVF))
10566 Idx = E2Mask[
Idx - CommonVF] + VF;
10570 ExtraCost += GetValueMinBWAffectedCost(V1);
10572 ExtraCost += GetNodeMinBWAffectedCost(
10573 *E2, std::min(CommonVF, E2->getVectorFactor()));
10574 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
10575 }
else if (!V1 && V2) {
10577 unsigned VF = getVF(V2);
10578 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
10579 CommonVF = std::max(VF, E1->getVectorFactor());
10582 return Idx < 2 * static_cast<int>(CommonVF);
10584 "All elements in mask must be less than 2 * CommonVF.");
10585 if (E1->Scalars.size() == VF && VF != CommonVF) {
10587 assert(!E1Mask.
empty() &&
"Expected non-empty common mask.");
10588 for (
int &
Idx : CommonMask) {
10591 if (
Idx >=
static_cast<int>(CommonVF))
10592 Idx = E1Mask[
Idx - CommonVF] + VF;
10598 ExtraCost += GetNodeMinBWAffectedCost(
10599 *E1, std::min(CommonVF, E1->getVectorFactor()));
10601 ExtraCost += GetValueMinBWAffectedCost(V2);
10602 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
10604 assert(V1 && V2 &&
"Expected both vectors.");
10605 unsigned VF = getVF(V1);
10606 CommonVF = std::max(VF, getVF(V2));
10609 return Idx < 2 * static_cast<int>(CommonVF);
10611 "All elements in mask must be less than 2 * CommonVF.");
10613 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
10614 if (V1->
getType() != V2->getType()) {
10616 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
10618 if (cast<VectorType>(V1->
getType())->getElementType() != ScalarTy)
10620 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
10621 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
10624 if (
auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
10629 InVectors.
front() =
10631 if (InVectors.
size() == 2)
10633 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
10634 V1, V2, CommonMask, Builder);
10641 : BaseShuffleAnalysis(ScalarTy),
TTI(
TTI),
10642 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
10643 CheckedExtracts(CheckedExtracts) {}
10645 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10646 unsigned NumParts,
bool &UseVecBaseAsInput) {
10647 UseVecBaseAsInput =
false;
10650 Value *VecBase =
nullptr;
10652 if (!E->ReorderIndices.empty()) {
10654 E->ReorderIndices.end());
10659 bool PrevNodeFound =
any_of(
10661 [&](
const std::unique_ptr<TreeEntry> &TE) {
10662 return ((!TE->isAltShuffle() &&
10663 TE->getOpcode() == Instruction::ExtractElement) ||
10665 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10666 return VL.size() > Data.index() &&
10667 (Mask[Data.index()] == PoisonMaskElem ||
10668 isa<UndefValue>(VL[Data.index()]) ||
10669 Data.value() == VL[Data.index()]);
10674 for (
unsigned Part : seq<unsigned>(NumParts)) {
10676 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10680 if (isa<UndefValue>(V) ||
10689 auto *EE = cast<ExtractElementInst>(V);
10690 VecBase = EE->getVectorOperand();
10691 UniqueBases.
insert(VecBase);
10692 const TreeEntry *VE = R.getTreeEntry(V);
10693 if (!CheckedExtracts.
insert(V).second ||
10694 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10697 return isa<GetElementPtrInst>(U) &&
10698 !R.areAllUsersVectorized(cast<Instruction>(U),
10706 unsigned Idx = *EEIdx;
10708 if (EE->hasOneUse() || !PrevNodeFound) {
10710 if (isa<SExtInst, ZExtInst>(Ext) &&
10711 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10716 EE->getVectorOperandType(),
Idx);
10719 Ext->getOpcode(), Ext->getType(), EE->getType(),
10734 if (!PrevNodeFound)
10735 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
10738 transformMaskAfterShuffle(CommonMask, CommonMask);
10739 SameNodesEstimated =
false;
10740 if (NumParts != 1 && UniqueBases.
size() != 1) {
10741 UseVecBaseAsInput =
true;
10749 std::optional<InstructionCost>
10753 return std::nullopt;
10759 return Idx < static_cast<int>(E1.getVectorFactor());
10761 "Expected single vector shuffle mask.");
10765 if (InVectors.
empty()) {
10766 CommonMask.
assign(Mask.begin(), Mask.end());
10767 InVectors.
assign({&E1, &E2});
10770 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
10773 if (NumParts == 0 || NumParts >= Mask.size() ||
10774 MaskVecTy->getNumElements() % NumParts != 0 ||
10776 MaskVecTy->getNumElements() / NumParts))
10781 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10782 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
10785 if (InVectors.
empty()) {
10786 CommonMask.
assign(Mask.begin(), Mask.end());
10787 InVectors.
assign(1, &E1);
10790 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
10793 if (NumParts == 0 || NumParts >= Mask.size() ||
10794 MaskVecTy->getNumElements() % NumParts != 0 ||
10796 MaskVecTy->getNumElements() / NumParts))
10801 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10802 estimateNodesPermuteCost(E1,
nullptr, Mask, Part, SliceSize);
10803 if (!SameNodesEstimated && InVectors.
size() == 1)
10815 auto *EI = cast<ExtractElementInst>(
10816 cast<const TreeEntry *>(InVectors.
front())
10817 ->getOrdered(
P.index()));
10818 return EI->getVectorOperand() == V1 ||
10819 EI->getVectorOperand() == V2;
10821 "Expected extractelement vectors.");
10825 if (InVectors.
empty()) {
10827 "Expected empty input mask/vectors.");
10828 CommonMask.
assign(Mask.begin(), Mask.end());
10829 InVectors.
assign(1, V1);
10834 assert(InVectors.
size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
10835 !CommonMask.
empty() &&
10838 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
10839 ->getOrdered(
P.index());
10841 return P.value() == Mask[
P.index()] ||
10842 isa<UndefValue>(Scalar);
10843 if (isa<Constant>(V1))
10845 auto *EI = cast<ExtractElementInst>(Scalar);
10846 return EI->getVectorOperand() == V1;
10848 "Expected only tree entry for extractelement vectors.");
10852 "Expected only tree entries from extracts/reused buildvectors.");
10853 unsigned VF = getVF(V1);
10854 if (InVectors.
size() == 2) {
10855 Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
10856 transformMaskAfterShuffle(CommonMask, CommonMask);
10857 VF = std::max<unsigned>(VF, CommonMask.
size());
10858 }
else if (
const auto *InTE =
10859 InVectors.
front().dyn_cast<
const TreeEntry *>()) {
10860 VF = std::max(VF, InTE->getVectorFactor());
10863 VF, cast<FixedVectorType>(cast<Value *>(InVectors.
front())->getType())
10864 ->getNumElements());
10867 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10869 CommonMask[
Idx] = Mask[
Idx] + VF;
10872 Value *Root =
nullptr) {
10873 Cost += getBuildVectorCost(VL, Root);
10877 unsigned VF = VL.
size();
10879 VF = std::min(VF, MaskVF);
10881 if (isa<UndefValue>(V)) {
10887 if (
auto *VecTy = dyn_cast<FixedVectorType>(Vals.
front()->getType())) {
10894 Type *ScalarTy = V->getType()->getScalarType();
10896 if (isa<PoisonValue>(V))
10898 else if (isa<UndefValue>(V))
10902 std::fill_n(NewVals.
begin() +
I * VecTyNumElements, VecTyNumElements,
10905 Vals.
swap(NewVals);
10911 cast<FixedVectorType>(Root->
getType())->getNumElements()),
10918 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10921 IsFinalized =
true;
10924 if (InVectors.
size() == 2)
10925 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
10927 Cost += createShuffle(Vec,
nullptr, CommonMask);
10928 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10932 "Expected vector length for the final value before action.");
10933 Value *V = cast<Value *>(Vec);
10934 Action(V, CommonMask);
10935 InVectors.
front() = V;
10937 if (!SubVectors.empty()) {
10939 if (InVectors.
size() == 2)
10940 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
10942 Cost += createShuffle(Vec,
nullptr, CommonMask);
10943 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10947 if (!SubVectorsMask.
empty()) {
10949 "Expected same size of masks for subvectors and common mask.");
10951 copy(SubVectorsMask, SVMask.begin());
10952 for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
10955 I1 = I2 + CommonMask.
size();
10962 for (
auto [E,
Idx] : SubVectors) {
10963 Type *EScalarTy = E->Scalars.front()->getType();
10964 bool IsSigned =
true;
10965 if (
auto It =
R.MinBWs.find(E); It !=
R.MinBWs.end()) {
10968 IsSigned = It->second.second;
10970 if (ScalarTy != EScalarTy) {
10971 unsigned CastOpcode = Instruction::Trunc;
10972 unsigned DstSz =
R.DL->getTypeSizeInBits(ScalarTy);
10973 unsigned SrcSz =
R.DL->getTypeSizeInBits(EScalarTy);
10975 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10985 if (!CommonMask.
empty()) {
10986 std::iota(std::next(CommonMask.
begin(),
Idx),
10987 std::next(CommonMask.
begin(),
Idx + E->getVectorFactor()),
10993 if (!ExtMask.
empty()) {
10994 if (CommonMask.
empty()) {
10998 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
11001 NewMask[
I] = CommonMask[ExtMask[
I]];
11003 CommonMask.
swap(NewMask);
11006 if (CommonMask.
empty()) {
11007 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
11011 createShuffle(InVectors.
front(),
11012 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
11018 "Shuffle construction must be finalized.");
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
    return VE;
  const auto *It =
      find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() &&
               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                 return EI.EdgeIdx == Idx && EI.UserTE == E;
               }) != TE->UserTreeIndices.end();
      });
  assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
  return It->get();
}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
      !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    // ... (a reversed reorder mask yields CastContextHint::Reversed)
  }
  return TTI::CastContextHint::None;
}
11057 const unsigned VF,
unsigned MinBW,
11089 auto It = MinBWs.
find(E);
11090 Type *OrigScalarTy = ScalarTy;
11091 if (It != MinBWs.
end()) {
11092 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
11098 unsigned EntryVF = E->getVectorFactor();
11101 if (E->isGather()) {
11104 if (isa<InsertElementInst>(VL[0]))
11106 if (isa<CmpInst>(VL.
front()))
11107 ScalarTy = VL.
front()->getType();
11108 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11109 E, ScalarTy, *
TTI, VectorizedVals, *
this, CheckedExtracts);
  if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
                                     /* ... */)) {
    if (E->getOpcode() == Instruction::Store) {
      // ...
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
      // ...
    }
    // ...
  }
  if (!E->ReuseShuffleIndices.empty())
    ::addMask(Mask, E->ReuseShuffleIndices);
  // ...
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize) &&
         "Unhandled state");
  assert(E->getOpcode() &&
         (/* ... */ ||
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy())) &&
         /* ... */);
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  // ...
  const unsigned Sz = UniqueValues.size();
  // ...
  for (unsigned I = 0; I < Sz; ++I) {
    if (isa<Instruction>(UniqueValues[I]) && getTreeEntry(UniqueValues[I]) == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (const TreeEntry *OpTE = getTreeEntry(V))
      return getCastContextHint(*OpTE);
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    // ...
  };
  // ...
    if (isa<CastInst, CallInst>(VL0)) {
      // ...
      ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
    } else {
      for (unsigned I = 0; I < Sz; ++I) {
        if (UsedScalars.test(I))
          continue;
        ScalarCost += ScalarEltCost(I);
      }
    }
    // ...
    if (/* ... */
        (E->getOpcode() != Instruction::Load ||
         !E->UserTreeIndices.empty())) {
      const EdgeInfo &EI =
          *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
            return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
          });
      if (EI.UserTE->getOpcode() != Instruction::Select ||
          /* ... */) {
        auto UserBWIt = MinBWs.find(EI.UserTE);
        Type *UserScalarTy =
            EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
        if (UserBWIt != MinBWs.end())
          UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                          UserBWIt->second.first);
        if (ScalarTy != UserScalarTy) {
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
          unsigned VecOpcode;
          auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
          if (BWSz > SrcBWSz)
            VecOpcode = Instruction::Trunc;
          else
            VecOpcode =
                It->second.second ? Instruction::SExt : Instruction::ZExt;
          // ...
        }
      }
    }
    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost,
                             "Calculated costs for Tree"));
    return VecCost - ScalarCost;
  };
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    // ...
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize) &&
           "Entry state expected to be Vectorize or StridedVectorize here.");
    // ...
    auto [ScalarCost, VecCost] = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
  };
  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    Type *CanonicalType = Ty;
    // ...
    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    // ...
    if (VI && SelectOnly) {
      assert(!Ty->isVectorTy() && "Expected only for scalar type.");
      auto *CI = cast<CmpInst>(VI->getOperand(0));
      IntrinsicCost -= TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
  };
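  // Per-opcode cost modeling: each case below builds a GetScalarCost /
  // GetVectorCost pair and returns GetCostDiff(...), i.e. (vector cost +
  // common shuffle cost) minus the summed scalar costs.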
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // ...
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      // ...
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
        // ...
        if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
          if (!OpTE->ReuseShuffleIndices.empty())
            ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                            OpTE->Scalars.size());
      }
    }
    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      // ...
      auto *I = cast<Instruction>(UniqueValues[Idx]);
      // ...
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        // ...
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        // ...
      }
      // ...
      if (I->hasOneUse()) {
        // ...
        if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
            all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
          // ...
              Ext->getOpcode(), Ext->getType(), I->getType(),
          // ...
        }
      }
      // ...
    };
    auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();
    // ...
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    // ...
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    // ...
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // ...
    if (OffsetBeg + InsertVecSz > VecSz) {
      // ...
      InsertVecSz = VecSz;
    }
    // ...
    if (!E->ReorderIndices.empty()) {
      // ...
    } else {
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    // ...
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      // ...
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");
    // ...
        InsertVecTy, Mask);
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    // ...
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        // ...
      }
      // ...
      for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
        // ...
    }
    // ...
  }
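  // Casts: when operands were demoted via MinBWs, the effective vector cast
  // opcode may differ from the scalar one. E.g. a sext whose source and
  // destination minimal bitwidths coincide degenerates to a free bitcast.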
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    // ...
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (/* ... */
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // ...
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        // ...
      }
      // ...
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // ...
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      // ...
      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(*UserIgnoreList, [](Value *V) {
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 Instruction::Xor},
                                I->getOpcode());
          });
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
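  // Compares and selects: scalar min/max-like selects are costed as
  // intrinsics where the pattern matches (see GetMinMaxCost above), and the
  // select condition is widened to the vector factor if needed.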
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    // ...
    if (match(VL0, MatchCmp))
      // ...
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      // ...
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      // ...
      if ((/* ... */ !match(VI, MatchCmp)) ||
          /* ... */)
        // ...
      InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
      // ...
        ScalarCost = IntrinsicCost;
      // ...
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      // ...
      InstructionCost VecCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), VecTy, MaskTy, VecPred,
          CostKind, getOperandInfo(E->getOperand(0)),
          getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        // ...
        unsigned CondNumElements = CondType->getNumElements();
        // ...
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // ...
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      // ...
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      // ...
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          // ...
          if (all_of(E->getOperand(I), [&](Value *Op) {
                auto *CI = dyn_cast<ConstantInt>(Op);
                return CI && CI->getValue().countr_one() >= It->second.first;
              }))
            return CommonCost;
        }
      }
      // ...
      unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
      // ...
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, {}, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
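  // Loads: the vector cost depends on the entry state. E.g. a strided load
  // (consecutive elements at stride > 1) is queried via
  // getStridedMemoryOpCost, a masked-gather lowering via
  // getGatherScatterOpCost, and plain consecutive or interleaved loads via
  // the ordinary memory-op hooks.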
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, TTI::OperandValueInfo(), VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(),
              /* ... */);
        }
        break;
      case TreeEntry::StridedVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::NeedToGather:
        // ...
      }
      return VecLdCost + CommonCost;
    };
    // ...
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;
    // ...
      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      // ...
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, OpInfo, VI);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // ...
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
                 "No reused shuffles expected");
          // ...
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, std::nullopt,
              BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          // ...
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
    // ...
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    // ...
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
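  // Calls are costed against both a vector intrinsic and a vector library
  // call (getVectorCallCosts); the cheaper of the two alternatives is used.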
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(VL0);
      // ...
      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
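  // Alternate-opcode nodes, e.g. in IR terms a bundle mixing
  //   %a = add i32 ..., %s = sub i32 ...
  // is vectorized as one vector add, one vector sub and a blending
  // shufflevector. A "diamond match" with an existing node that has the same
  // operands makes the second vector instruction effectively free.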
  case Instruction::ShuffleVector: {
    if (!SLPReVec || E->isAltShuffle())
      assert(E->isAltShuffle() &&
             (/* ... */ ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");
    // ...
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        // ...
        if (TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      // ...
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
      // ...
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      // ...
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          // ...
        });
        // ...
      }
      // ...
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        // ...
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            /* ... */);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            /* ... */);
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        // ...
        auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcBWSz =
            DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
        if (SrcIt != MinBWs.end()) {
          SrcBWSz = SrcIt->second.first;
          // ...
        }
        if (BWSz <= SrcBWSz) {
          if (BWSz < SrcBWSz)
            VecCost =
                TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                        /* ... */);
          LLVM_DEBUG({
            dbgs() << "SLP: alternate extension, which should be truncated.\n";
            // ...
          });
          // ...
        } else {
          VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                            /* ... */);
          VecCost +=
              TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                      /* ... */);
        }
      }
      // ...
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            // ...
          },
          Mask);
      // ...
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      // ...
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      // ...
    };
    if (SLPReVec && !E->isAltShuffle())
      return GetCostDiff(
          GetScalarCost, [&](InstructionCost) -> InstructionCost {
            // ...
            assert(/* ... */ "Not supported shufflevector usage.");
            auto *SV = cast<ShuffleVectorInst>(VL.front());
            unsigned SVNumElements =
                cast<FixedVectorType>(SV->getOperand(0)->getType())
                    ->getNumElements();
            unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
            for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
              // ...
              assert(isa<ShuffleVectorInst>(V) &&
                     "Not supported shufflevector usage.");
              auto *SV = cast<ShuffleVectorInst>(V);
              // ...
              [[maybe_unused]] bool IsExtractSubvectorMask =
                  SV->isExtractSubvectorMask(Index);
              assert(IsExtractSubvectorMask &&
                     "Not supported shufflevector usage.");
              if (NextIndex != Index) {
                // ...
              }
              NextIndex += SV->getShuffleMask().size();
            }
            return ::getShuffleCost(
                /* ... */);
          });
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
    return CommonCost;
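// A tree of height 1-2 is considered "fully vectorizable" below if its gather
// operands are cheap to materialize: small, extractelement-based, or loads.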
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable .\n");
  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (/* ... */
            TE->Scalars.size() < Limit ||
            ((TE->getOpcode() == Instruction::ExtractElement ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             /* ... */) ||
            (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));
  };
  // ...
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       (/* ... */
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;
  if (VectorizableTree.size() != 2)
    return false;
  // ...
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;
  // ...
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize))
    return false;
  return true;
}
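// Load-combine detection: an or/shl-by-multiple-of-8 chain over zext'ed loads
// (e.g. (zext(load i8) << 8) | zext(load i8) forming an i16) is better served
// by backend load combining, so SLP backs off from vectorizing it.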
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // ...
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // ...
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;
  // ...
  Type *SrcTy = Load->getType();
  // ...
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");
  return true;
}
bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
  // ...
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /*MustMatchOrInst=*/false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  // ...
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    // ...
  }
  // ...
}
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
    return true;
  }
  // ...
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         /* ... */)))
    return true;
  // ...
  constexpr int Limit = 4;
  if (/* ... */
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                TE->getOpcode() != Instruction::ExtractElement &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;
  // ...
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;
  // ...
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       /* ... */);
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, UndefValue>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return true;
  // ...
  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      /* ... */
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      /* ... */
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
      /* ... */)
    return true;
  // ...
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      /* ... */
      any_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() &&
               TE->getOpcode() == Instruction::Load &&
               /* ... */;
      }))
    return true;
  // ...
    TreeEntry &E = *VectorizableTree[Idx];
    // ...
    if (E.getOpcode() && E.getOpcode() != Instruction::Load)
      return false;
  // ...
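// Spill-cost estimation: walk the vectorized bundles in dominance order and
// count values live across call boundaries; each such call may force a
// spill/reload of a vector register, which is added to the tree cost.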
  // ...
  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
  // ...

InstructionCost BoUpSLP::getSpillCost() const {
  // ...
  for (const auto &TEPtr : VectorizableTree) {
    if (TEPtr->State != TreeEntry::Vectorize)
      continue;
    Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
    // ...
  }
  llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
    auto *NodeA = DT->getNode(A->getParent());
    auto *NodeB = DT->getNode(B->getParent());
    assert(NodeA && "Should only process reachable instructions");
    assert(NodeB && "Should only process reachable instructions");
    assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    if (NodeA != NodeB)
      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
    return B->comesBefore(A);
  });

  for (Instruction *Inst : OrderedScalars) {
    // ...
    LiveValues.erase(PrevInst);
    for (auto &J : PrevInst->operands()) {
      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
        LiveValues.insert(cast<Instruction>(&*J));
    }
    LLVM_DEBUG({
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
      // ...
    });
    // ...
    unsigned NumCalls = 0;
    // ...
    while (InstIt != PrevInstIt) {
      if (PrevInstIt == PrevInst->getParent()->rend()) {
        PrevInstIt = Inst->getParent()->rbegin();
        continue;
      }
      auto NoCallIntrinsic = [this](Instruction *I) {
        if (auto *II = dyn_cast<IntrinsicInst>(I)) {
          if (II->isAssumeLikeIntrinsic())
            return true;
          // ...
          for (auto &ArgOp : II->args())
            Tys.push_back(ArgOp->getType());
          if (auto *FPMO = dyn_cast<FPMathOperator>(II))
            FMF = FPMO->getFastMathFlags();
          // ...
          if (IntrCost < CallCost)
            return true;
        }
        return false;
      };
      if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
          &*PrevInstIt != PrevInst)
        NumCalls++;
      // ...
    }
    if (NumCalls) {
      // ...
      for (auto *II : LiveValues) {
        auto *ScalarTy = II->getType();
        if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
          ScalarTy = VectorTy->getElementType();
        // ...
      }
      // ...
    }
    // ...
  }
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  // ...
  do {
    // ...
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        /* ... */)
      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        /* ... */)
      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
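// performExtractsShuffleAction folds a list of (vector, mask) pairs into a
// chain of at-most-two-operand shuffles, resizing operands as needed via
// ResizeAction and invoking Action to emit (or cost) each step.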
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector IsBaseUndef =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  if (!IsBaseUndef.all()) {
    // ...
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    // ...
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      // ...
      Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    auto *V = ValueSelect::get<T *>(Base);
    // ...
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // ...
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    // ...
    Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // ...
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // ...
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        // ...
        Mask[I] = SecMask[I] + Vec1VF;
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // ...
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      // ...
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        // ...
        Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  bool IsBaseNotUndef = !IsBaseUndef.all();
  (void)IsBaseNotUndef;
  // ...
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // ...
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    // ...
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      // ...
      assert(/* ... */ "Multiple uses of scalars.");
      Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      // ...
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
template <typename T> struct ShuffledInsertData {
  // ...
};
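// getTreeCost is, roughly: the sum of the entry costs, plus extract costs for
// externally used scalars, plus spill cost, minus the cost already covered by
// insertelement-based shuffles of external users.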
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
  InstructionCost Cost = 0;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");
  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
  // ...
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // ...
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG({
        dbgs() << "SLP: Skipping cost for combined node that starts with "
               << *TE.Scalars[0] << ".\n";
        TE.dump();
        dbgs() << "SLP: Current total cost = " << Cost << "\n";
      });
      continue;
    }
    if (TE.isGather()) {
      if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
          E && E->getVectorFactor() == TE.getVectorFactor() &&
          E->isSame(TE.Scalars)) {
        // ...
        LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }
    // ...
    assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
           "Expected gather nodes with users only.");
    // ...
    LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << Cost << "\n");
  }
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  // ...
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  }
  for (ExternalUser &EU : ExternalUses) {
    // ...
    if (EphValues.count(EU.User))
      continue;
    // ...
    BasicBlock *UserParent =
        EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
    if (/* ... */
        isa_and_present<UnreachableInst>(UserParent->getTerminator()))
      continue;
    // ...
    if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;
    // ...
    if (isa<FixedVectorType>(EU.Scalar->getType()))
      continue;
    // ...
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
        /* ... */) {
      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
        if (!UsedInserts.insert(VU).second)
          continue;
        // ...
        const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
        // ...
        auto It = find_if(
            ShuffledInserts,
            [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
              // ...
              Value *Op0 = II->getOperand(0);
              if (getTreeEntry(II) && !getTreeEntry(Op0))
                // ...
            });
        if (It == ShuffledInserts.end()) {
          // ...
          Data.InsertElements.emplace_back(VU);
          // ...
          VecId = ShuffledInserts.size() - 1;
          auto It = MinBWs.find(ScalarTE);
          if (It != MinBWs.end() &&
              VectorCasts
                  .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                  .second) {
            unsigned BWSz = It->second.first;
            unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
            unsigned VecOpcode;
            if (DstBWSz < BWSz)
              VecOpcode = Instruction::Trunc;
            else
              VecOpcode =
                  It->second.second ? Instruction::SExt : Instruction::ZExt;
            // ...
                           FTy->getNumElements()),
            // ...
            LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                              << " for extending externally used vector with "
                                 "non-equal minimum bitwidth.\n");
            Cost += C;
          }
        } else {
          // ...
          It->InsertElements.front() = VU;
          VecId = std::distance(ShuffledInserts.begin(), It);
        }
        int InIdx = *InsertIdx;
        SmallVectorImpl<int> &Mask =
            ShuffledInserts[VecId].ValueMasks[ScalarTE];
        // ...
        Mask[InIdx] = EU.Lane;
        DemandedElts[VecId].setBit(InIdx);
        continue;
      }
    }
    // ...
    auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
    const TreeEntry *Entry = getTreeEntry(EU.Scalar);
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
                            ? Instruction::ZExt
                            : Instruction::SExt;
      // ...
    } else {
      ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                          CostKind, EU.Lane, EU.Scalar,
                                          ScalarUserAndIdx);
    }
    // ...
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // ...
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
          auto *I = cast<Instruction>(U.Scalar);
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
        }
        return false;
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for_each(enumerate(ExternalUses), [&](const auto &P) {
          // ...
          if (IsPhiInLoop(P.value()))
            return;
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        });
      }
      // ...
      auto *Inst = cast<Instruction>(EU.Scalar);
      // ...
      auto OperandIsScalar = [&](Value *V) {
        if (!getTreeEntry(V)) {
          // ...
          if (auto *EE = dyn_cast<ExtractElementInst>(V))
            return !EE->hasOneUse() || !MustGather.contains(EE);
          return true;
        }
        return ValueToExtUses->contains(V);
      };
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
            Op && all_of(Op->operands(), OperandIsScalar)) {
          InstructionCost OpCost =
              (getTreeEntry(Op) && !ValueToExtUses->contains(Op))
                  ? /* ... */
                  : 0;
          if (ScalarCost + OpCost <= ExtraCost) {
            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
            ScalarCost += OpCost;
          }
        }
      }
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        // ...
        bool IsProfitablePHIUser =
            (/* ... */
             VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
            // ...
            any_of(Inst->users(), [&](User *U) {
              auto *PHIUser = dyn_cast<PHINode>(U);
              return (!PHIUser ||
                      PHIUser->getParent() !=
                          cast<Instruction>(
                              VectorizableTree.front()->getMainOp())
                              ->getParent()) &&
                     /* ... */;
            }) &&
            // ...
              return ValueToExtUses->contains(V);
            // ...
        if (IsProfitablePHIUser) {
          KeepScalar = true;
        } else if (/* ... */
                   (!GatheredLoadsEntriesFirst.has_value() ||
                    Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          }
          // ...
          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
        }
        if (KeepScalar) {
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for_each(Inst->operands(), [&](Value *V) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              // ...
              ExternalUses[It->second].User = nullptr;
            }
          });
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            // ...
            if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
              for_each(IOp->operands(), [&](Value *V) {
                auto It = ValueToExtUses->find(V);
                if (It != ValueToExtUses->end()) {
                  // ...
                  ExternalUses[It->second].User = nullptr;
                }
              });
            }
          }
        }
      }
    }
    ExtractCost += ExtraCost;
  }
  // ...
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    if (const TreeEntry *E = getTreeEntry(V)) {
      ExternalUses.emplace_back(V, nullptr, E->findLaneForValue(V));
    }
  }
  // ...
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      // ...
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        // ...
        if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
          // ...
        }
      }
    }
  }
  // ...
  Cost += SpillCost + ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    if (VF != VecVF &&
        (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
         /* ... */)) {
      // ...
      std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                /* ... */);
      // ...
      LLVM_DEBUG({
        dbgs() << "SLP: Adding cost " << C
               << " for final shuffle of insertelement external users.\n";
        TE->dump();
        dbgs() << "SLP: Current total cost = " << Cost << "\n";
      });
      // ...
      return std::make_pair(TE, true);
    }
    return std::make_pair(TE, false);
  };
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    // ...
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        // ...
        VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        // ...
            (Data.index() < VF &&
             static_cast<int>(Data.index()) == Data.value());
        // ...
        LLVM_DEBUG({
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement "
                    "external users.\n";
          TEs.front()->dump();
          dbgs() << "SLP: Current total cost = " << Cost << "\n";
        });
        // ...
      } else {
        // ...
        if (/* ... */
            TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
          VF = TEs.front()->getVectorFactor();
        // ...
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        // ...
        LLVM_DEBUG({
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of vector node and external "
                    "insertelement users.\n";
          if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
          dbgs() << "SLP: Current total cost = " << Cost << "\n";
        });
        // ...
      }
      // ...
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
        DemandedElts[I], /*Insert=*/true, /*Extract=*/false, CostKind);
    Cost -= InsertCost;
  }
  // ...
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            all_of(*UserIgnoreList, [](Value *V) {
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   Instruction::Xor},
                                  I->getOpcode());
            });
        if (IsArithmeticExtendedReduction)
          Opcode = Instruction::BitCast;
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        // ...
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        // ...
        }
        LLVM_DEBUG({
          dbgs() << "SLP: Adding cost " << CastCost
                 << " for final resize for reduction from " << SrcVecTy
                 << " to " << DstVecTy << "\n";
          dbgs() << "SLP: Current total cost = " << Cost << "\n";
        });
        // ...
      }
    }
  }
  // ...
  {
    raw_svector_ostream OS(Str);
    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
       << "SLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  // ...
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
  return Cost;
}
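// The next routine tries to represent a gather of extractelements coming from
// at most two source vectors as a single shuffle; undef extracts are folded
// into the mask so they do not block the match.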
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // ...
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI) {
      if (isa<UndefValue>(VL[I]))
        // ...
      continue;
    }
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
      continue;
    // ...
      ExtractMask.reset(*Idx);
    // ...
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // ...
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // ...
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // ...
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      // ...
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        // ...
  }
  // ...
  for (int Idx : UndefVectorExtracts)
    // ...
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask);
  if (!Res)
    return std::nullopt;
  // ...
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
        isa<UndefValue>(GatheredExtracts[I])) {
      // ...
    }
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
        /* ... */)
      // ...
  }
  // ...
}

SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  // ...
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // ...
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
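// Single-register variant: find up to two existing tree entries whose scalars
// cover this gather so it can be re-materialized as a shuffle instead of a
// buildvector, subject to the dominance/ordering constraints checked via
// CheckOrdering below.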
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  // ...
  const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
                                ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
                                : TE->UserTreeIndices.front();
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  // ...
  if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    // ...
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (/* ... */)
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  // ...
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // ...
    auto *NodeEUI = DT->getNode(InsertBlock);
    // ...
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // ...
    if (TEInsertPt->getParent() != InsertBlock &&
        /* ... */)
      // ...
    if (TEInsertPt->getParent() == InsertBlock &&
        /* ... */)
      // ...
  };
  for (Value *V : VL) {
    // ...
    for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
      if (TEPtr == TE || TEPtr->Idx == 0)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndices.size() == 1 &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
      // ...
      PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
      const Instruction *InsertPt =
          UserPHI /* ... */
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // ...
        if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
          continue;
        // ...
        if (TEUseEI.UserTE != UseEI.UserTE &&
            TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
          continue;
      }
      // ...
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
        continue;
      // ...
    }
    if (const TreeEntry *VTE = getTreeEntry(V)) {
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
        if (VTE->State != TreeEntry::Vectorize) {
          auto It = MultiNodeScalars.find(V);
          if (It == MultiNodeScalars.end())
            continue;
          VTE = *It->getSecond().begin();
          // ...
          auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
            return MTE->State == TreeEntry::Vectorize;
          });
          if (MIt == It->getSecond().end())
            continue;
          // ...
        }
      }
      Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
      if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
        continue;
      // ...
    }
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // ...
    } else {
      // ...
      if (!VToTEs.empty()) {
        // ...
      }
      VToTEs = SavedVToTEs;
      // ...
      if (UsedTEs.size() == 2)
        continue;
      UsedTEs.push_back(SavedVToTEs);
      // ...
    }
  }
  if (UsedTEs.empty()) {
    // ...
    return std::nullopt;
  }
  // ...
  if (UsedTEs.size() == 1) {
    // ...
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // ...
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        // ...
      }
      // ...
      for (unsigned I : seq<unsigned>(VL.size()))
        if (isa<PoisonValue>(VL[I]))
          // ...
      // ...
    }
    // ...
    Entries.push_back(FirstEntries.front());
    VF = FirstEntries.front()->getVectorFactor();
  } else {
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    // ...
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      // ...
    }
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        // ...
        Entries.push_back(It->second);
        Entries.push_back(TE);
        // ...
      }
    }
    // ...
    if (Entries.empty()) {
      Entries.push_back(*std::min_element(
          UsedTEs.front().begin(), UsedTEs.front().end(),
          [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          }));
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    } else {
      VF = Entries.front()->getVectorFactor();
    }
  }
  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  // ...
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // ...
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      // ...
      Value *In1 = PHI1->getIncomingValue(I);
      // ...
      if (cast<Instruction>(In)->getParent() !=
          /* ... */)
        // ...
    }
    // ...
  };
  // ...
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
           /* ... */
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // ...
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    // ...
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           /* ... */
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  };
  // ...
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // ...
    if (/* ... */
        ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
         (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
      continue;
    unsigned Idx = It->second;
    // ...
  }
  // ...
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // ...
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    // ...
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      !VL.equals(ArrayRef(TE->Scalars)
                     .slice(Part * VL.size(),
                            std::min<int>(VL.size(), TE->Scalars.size())))) {
    // ...
    return std::nullopt;
  }
  bool IsIdentity = Entries.size() == 1;
  // ...
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    Mask[Idx] =
        Pair.first * VF +
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
    case 1:
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteSingleSrc;
      break;
    case 2:
      if (EntryLanes.size() > 2 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteTwoSrc;
      break;
    default:
      break;
    }
  } else if (!isa<VectorType>(VL.front()->getType()) &&
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // ...
    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                             std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
      // ...
    }
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        /* ... */
        (MaxElement % VF) -
            (MinElement % VF) + 1);
    // ...
      Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
            (Idx >= static_cast<int>(VF) ? NewVF : 0);
    // ...
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&,
                           /* ... */](ArrayRef<int> Mask,
                                      ArrayRef<const TreeEntry *> Entries,
                                      VectorType *VecTy) -> InstructionCost {
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
          ShuffleVectorInst::isDeInterleaveMaskOfFactor(
              Mask, Entries.front()->getInterleaveFactor()))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI,
                              /* ... */);
    };
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    // ...
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
    } else {
      // ...
      bool IsIdentity = true;
      // ...
        if (Idx >= static_cast<int>(NewVF)) {
          // ...
        } else {
          // ...
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      // ...
      FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
      FirstShuffleCost += TTI->getScalarizationOverhead(
          MaskVecTy, DemandedElts, /*Insert=*/true,
          /* ... */);
    }
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
    } else {
      // ...
      bool IsIdentity = true;
      // ...
        if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
          // ...
        } else {
          // ...
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      // ...
      SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
      SecondShuffleCost += TTI->getScalarizationOverhead(
          MaskVecTy, DemandedElts, /*Insert=*/true,
          /* ... */);
    }
    // ...
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx >= static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                    });
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    }
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx < static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                      // ...
                    });
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    }
    if (BuildVectorCost >= ShuffleCost) {
      // ...
      Entries.push_back(BestEntry);
      // ...
    }
  }
  // ...
  std::fill(std::next(Mask.begin(), Part * VL.size()),
            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
  // ...
  return std::nullopt;
}
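// Multi-register driver: split the gather into NumParts register-sized slices
// and run the single-register matching on each slice independently.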
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
    bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  // ...
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(/* ... */,
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // ...
  if (TE->isNonPowOf2Vec())
    return {};
  // ...
  assert((TE->UserTreeIndices.size() == 1 ||
          TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(/* ... */ "Number of scalars must be divisible by NumParts.");
  if (!TE->UserTreeIndices.empty() &&
      TE->UserTreeIndices.front().UserTE->isGather() &&
      TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
    assert((TE->Idx == 0 || TE->getOpcode() == Instruction::ExtractElement ||
            /* ... */) &&
           "Expected splat or extractelements only node.");
    // ...
  }
  // ...
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // ...
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    // ...
    if (/* ... */
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      // ...
      std::iota(Mask.begin(), Mask.end(), 0);
      // ...
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          // ...
      Entries.emplace_back(1, LocalSubEntries.front());
      // ...
    }
  }
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
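// Cost of materializing a gather: per-element inserts (with integer casts for
// mismatched scalar types) plus one single-source permute when the gathered
// list contains duplicated non-constant values.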
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  // ...
  bool DuplicateNonConst = false;
  // ...
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy) {
      // ...
    }
    // ...
  };
  // ...
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // ...
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
      // ...
      continue;
    }
    // ...
    EstimateInsertCost(I, V);
    ShuffleMask[I] = I;
    // ...
    DuplicateNonConst = true;
    // ...
    ShuffleMask[I] = Res.first->second;
  }
  if (ForPoisonSrc) {
    if (isa<FixedVectorType>(ScalarTy)) {
      // ...
      for (unsigned I : seq<unsigned>(VL.size()))
        if (!ShuffledElements[I])
          // ...
              I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
      // ...
    }
    // ...
  }
  if (DuplicateNonConst)
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                             VecTy, ShuffleMask);
  return Cost;
}
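// The "last instruction" of a bundle decides where vectorized code is
// emitted; ties across basic blocks are broken using dominator-tree DFS-in
// numbers, mirroring the comparator used for spill-cost ordering above.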
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
  // ...
  auto *Front = E->getMainOp();
  // ...
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           E->getOpcode() == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (E->getOpcode() == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              /* ... */
              (GatheredLoadsEntriesFirst.has_value() &&
               E->getOpcode() == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      // ...
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    return LastInst;
  };

  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              /* ... */) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      // ...
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  // ...
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      E->getOpcode() == Instruction::Load) {
    Res = FindFirstInst();
    // ...
  }
  // ...
  if ((E->getOpcode() == Instruction::GetElementPtr &&
       any_of(E->Scalars,
              [](Value *V) {
                return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
              })) ||
      all_of(E->Scalars,
             [](Value *V) {
               return isa<PoisonValue>(V) ||
                      (!isVectorLikeInstWithConstOps(V) &&
                       isUsedOutsideBlock(V));
             }) ||
      (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
         return isa<ExtractElementInst, UndefValue>(V) ||
                areAllOperandsNonInsts(V);
       })))
    Res = FindLastInst();
  else
    Res = FindFirstInst();
  // ...
  if (BlocksSchedules.count(BB) && !E->isGather()) {
    Value *V = E->isOneOf(E->Scalars.back());
    // ...
    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
    if (Bundle && Bundle->isPartOfBundle())
      for (; Bundle; Bundle = Bundle->NextInBundle)
        Res = Bundle->Inst;
    // ...
  }
  // ...
  Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  // ...
}
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  // ...
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  // ...
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  // ...
    Builder.SetInsertPoint(
        /* ... */);
  // ...
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
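// gather() emits the actual insertelement sequence; scalars that are loop
// invariant, already vectorized, or defined in a (transitive) single
// predecessor are postponed and inserted only after the root vector has been
// shuffled into place.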
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // ...
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           getTreeEntry(Inst) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(Inst, I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      // ...
      if (auto *CI = dyn_cast<CastInst>(Scalar);
          isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
        // ...
        if (auto *IOp = dyn_cast<Instruction>(Op);
            !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
          Scalar = Op;
      }
      Scalar = Builder.CreateIntCast(/* ... */);
    }
    // ...
    if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
      // ...
      auto *II = dyn_cast<IntrinsicInst>(InsElt);
      if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
        return Vec;
    } else {
      Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
      InsElt = dyn_cast<InsertElementInst>(Vec);
      // ...
    }
    GatherShuffleExtractSeq.insert(InsElt);
    // ...
    if (isa<Instruction>(V)) {
      if (TreeEntry *Entry = getTreeEntry(V)) {
        // ...
        User *UserOp = nullptr;
        // ...
        if (auto *SI = dyn_cast<Instruction>(Scalar))
          UserOp = SI;
        // ...
        unsigned FoundLane = Entry->findLaneForValue(V);
        ExternalUses.emplace_back(V, UserOp, FoundLane);
      }
    }
    return Vec;
  };
  // ...
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && isa<PoisonValue>(SV->getOperand(1)) &&
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // ...
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    if (isa<PoisonValue>(VL[I]))
      continue;
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  }
  // ...
  if (isa<PoisonValue>(Vec)) {
    Vec = OriginalRoot;
  } else {
    Vec = CreateShuffle(Root, Vec, Mask);
    if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
        OI && OI->hasNUses(0) &&
        none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
          return TE->VectorizedValue == OI;
        }))
      // ...
  }
  // ...
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // ...
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
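// What follows is the IR-emission counterpart of the cost-side shuffle
// estimator: ShuffleIRBuilder adapts IRBuilder to the
// BaseShuffleAnalysis::createShuffle template, recording every emitted
// shuffle in GatherShuffleExtractSeq so it can be CSE'd later.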
/// Merges shuffle masks and emits the final shuffle instruction, if required.
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  /// Combined mask for all applied operands and masks.
  SmallVector<int> CommonMask;
  SmallVector<Value *, 2> InVectors;
  IRBuilderBase &Builder;
  BoUpSLP &R;

  /// Shuffle-instruction emission helper.
  class ShuffleIRBuilder {
    IRBuilderBase &Builder;
    /// Holds all of the instructions that we gathered.
    SetVector<Instruction *> &GatherShuffleExtractSeq;
    /// A list of blocks that we are going to CSE.
    DenseSet<BasicBlock *> &CSEBlocks;
    /// Data layout.
    const DataLayout &DL;

  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates a shufflevector for the 2 operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (V1->getType() != V2->getType()) {
          if (cast<VectorType>(V2->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            V2 = Builder.CreateIntCast(
                V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
          else
            V1 = Builder.CreateIntCast(
                V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
        }
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates a permutation of the single vector operand with the given mask,
    /// if it is not an identity mask.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      unsigned VF = Mask.size();
      unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
        return V1;
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(getWidenedType(Ty, VF));
    }
    /// Resizes the 2 input vectors to match, if their sizes differ: the
    /// smaller vector is widened to the size of the larger one.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
  };

  /// Smart shuffle instruction emission, walks through the shuffle trees and
  /// tries to find the best matching vector for the actual shuffle.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder);
  }
  /// Transforms mask \p CommonMask per given \p Mask to make a proper set
  /// after shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx;
  }

  /// Casts value \p V to a vector with the same number of elements but with
  /// the base element type \p ScalarTy.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    // ... (cast body elided in the excerpt)
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
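  // adjustExtracts() turns a gather of extractelement instructions back into
  // shuffles of their source vectors: each extract's base is resolved to its
  // vectorized value, extracts whose only use is vectorized are erased, and
  // when the extracts span several registers the per-part shuffles are joined
  // into one virtual long vector. UseVecBaseAsInput reports whether the single
  // vector base itself serves as the shuffle input.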
  /// Adjusts extractelements after reusing them instead of extracting.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      // ... (poison-lane skip elided in the excerpt)
      auto *EI = cast<ExtractElementInst>(VL[I]);
      VecBase = EI->getVectorOperand();
      if (const TreeEntry *TE = R.getTreeEntry(VecBase))
        VecBase = TE->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If its only use is vectorized, the extractelement itself can be
      // erased.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            const TreeEntry *UTE = R.getTreeEntry(U);
            return !UTE || R.MultiNodeScalars.contains(U) ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return any_of(TE->UserTreeIndices,
                                            [&](const EdgeInfo &Edge) {
                                              return Edge.UserTE == UTE;
                                            }) &&
                                     is_contained(VL, EI);
                            }) /* != ... (count comparison elided) */;
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    // Perform a multi-register vector shuffle, joining the parts into a single
    // virtual long vector.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      // ... (SubVL/SubMask slicing elided in the excerpt)
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (const TreeEntry *TE = R.getTreeEntry(VecOp))
              VecOp = TE->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        if (I == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (const TreeEntry *TE = R.getTreeEntry(VecOp))
          VecOp = TE->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert(/* all previous parts of Mask.slice(P * SliceSize, ...) are
                  poison -- check elided in the excerpt */
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        unsigned NewVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
        if (Vec->getType() != SubVec->getType()) {
          unsigned SubVecVF =
              cast<FixedVectorType>(SubVec->getType())->getNumElements();
          NewVF = std::max(NewVF, SubVecVF);
        }
        // Adjust SubMask.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += NewVF;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    copy(VecMask, Mask.begin());
    return Vec;
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all deps are ready.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the process
    // to keep correct order: emit a placeholder value for now.
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    // ... (placeholder creation elided in the excerpt)
  }
  /// Adds 2 input vectors (in the form of tree entries) and the mask for
  /// their shuffling.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
             if (isa<PoisonValue>(V))
               return false;
             return !isKnownNonNegative(V, SimplifyQuery(*R.DL));
           }));
    Value *V2 = E2.VectorizedValue;
    if (V2->getType()->isIntOrIntVectorTy())
      V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
             if (isa<PoisonValue>(V))
               return false;
             return !isKnownNonNegative(V, SimplifyQuery(*R.DL));
           }));
    add(V1, V2, Mask);
  }
  /// Adds a single input vector (in the form of a tree entry) and the mask for
  /// its shuffling.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
             if (isa<PoisonValue>(V))
               return false;
             return !isKnownNonNegative(V, SimplifyQuery(*R.DL));
           }));
    add(V1, Mask);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds another input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool = false) {
    assert(isa<FixedVectorType>(V1->getType()) &&
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        unsigned VF = std::max(CommonMask.size(), Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] =
                V->getType() != V1->getType()
                    ? Idx + VF
                    : Mask[Idx] + cast<FixedVectorType>(V1->getType())
                                      ->getNumElements();
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // Check if the second vector is required, i.e. the used elements are not
      // already covered by the first one.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
          InVectors.push_back(V1);
          break;
        }
    }
    int VF = getVF(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
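  // finalize() flushes the shuffles accumulated in InVectors/CommonMask. The
  // optional Action callback may rewrite the vector before the external
  // ExtMask is applied, and SubVectors (combined sub-entries) are re-inserted
  // at their recorded offsets via createInsertVector.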
  /// Finalize emission of the shuffles.
  /// \param Action the action (if any) to be performed before final applying
  /// of the \p ExtMask mask.
  Value *
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    SmallVector<int> NewExtMask(ExtMask);
    if (ScalarTyNumElements != 1) {
      assert(SLPReVec && "FixedVectorType is not expected.");
      transformScalarShuffleIndiciesToVector(ScalarTyNumElements, CommonMask);
      transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewExtMask);
      ExtMask = NewExtMask;
    }
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask);
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = E->VectorizedValue;
          if (V->getType()->isIntOrIntVectorTy())
            V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
                  if (isa<PoisonValue>(V))
                    return false;
                  return !isKnownNonNegative(V, SimplifyQuery(*R.DL));
                }));
          unsigned InsertionIndex = Idx * ScalarTyNumElements;
          Vec = createInsertVector(
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1,
                        _2, _3));
          if (!CommonMask.empty()) {
            std::iota(
                std::next(CommonMask.begin(), InsertionIndex),
                std::next(CommonMask.begin(),
                          (Idx + E->getVectorFactor()) * ScalarTyNumElements),
                InsertionIndex);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem)
            I1 = I2 + CommonMask.size();
        }
        Value *InsertVec =
            CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
        Vec = createShuffle(InsertVec, Vec, SVMask);
        for (unsigned I : seq<unsigned>(CommonMask.size())) {
          if (SVMask[I] != PoisonMaskElem)
            CommonMask[I] = I;
        }
      }
      InVectors.front() = Vec;
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
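// getMatchedVectorizedOperand() looks for an already-vectorized tree entry
// that produces exactly the scalars of operand NodeIdx of E, either through a
// direct user edge or through an equivalent operand gather node, so the
// emitted vector can be reused instead of regathered.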
BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
                                                         unsigned NodeIdx) {
  ArrayRef<Value *> VL = E->getOperand(NodeIdx);
  InstructionsState S = getSameOpcode(VL, *TLI);
  // Special processing for GEPs bundle, which may include non-gep values.
  if (!S && VL.front()->getType()->isPointerTy()) {
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    if (It != VL.end())
      S = getSameOpcode(*It, *TLI);
  }
  if (!S)
    return nullptr;
  auto CheckSameVE = [&](const TreeEntry *VE) {
    return VE->isSame(VL) &&
           (any_of(VE->UserTreeIndices,
                   [E, NodeIdx](const EdgeInfo &EI) {
                     return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
                   }) ||
            any_of(VectorizableTree,
                   [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
                     return TE->isOperandGatherNode(
                                {const_cast<TreeEntry *>(E), NodeIdx}) &&
                            VE->isSame(TE->Scalars);
                   }));
  };
  TreeEntry *VE = getTreeEntry(S.getMainOp());
  if (VE && CheckSameVE(VE))
    return VE;
  auto It = MultiNodeScalars.find(S.getMainOp());
  if (It != MultiNodeScalars.end()) {
    auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
      return TE != VE && CheckSameVE(TE);
    });
    if (I != It->getSecond().end())
      return *I;
  }
  return nullptr;
}
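// vectorizeOperand() returns the vector value for operand NodeIdx of E: a
// matching vectorized entry is reused (with an extra shuffle when the vector
// factors differ); otherwise the corresponding operand gather node is
// vectorized in place.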
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                 bool PostponedPHIs) {
  ValueList &VL = E->getOperand(NodeIdx);
  const unsigned VF = VL.size();
  if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
    auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
      // V may come from a MinBWs-narrowed entry; take the element type from
      // the value itself.
      Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
      ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
      ShuffleBuilder.add(V, Mask);
      SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
          E->CombinedEntriesWithIndices.size());
      transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
                [&](const auto &P) {
                  return std::make_pair(VectorizableTree[P.first].get(),
                                        P.second);
                });
      assert((E->CombinedEntriesWithIndices.empty() ||
              E->ReorderIndices.empty()) &&
             "Expected either combined subnodes or reordering");
      return ShuffleBuilder.finalize({}, SubVectors, {});
    };
    Value *V = vectorizeTree(VE, PostponedPHIs);
    if (VF * getNumElements(VL[0]->getType()) !=
        cast<FixedVectorType>(V->getType())->getNumElements()) {
      if (!VE->ReuseShuffleIndices.empty()) {
        // Reshuffle to get only unique values.
        SmallVector<int> Mask(VF, PoisonMaskElem);
        for (auto [I, V] : enumerate(VL)) {
          if (isa<PoisonValue>(V))
            continue;
          Mask[I] = VE->findLaneForValue(V);
        }
        V = FinalShuffle(V, Mask);
      } else {
        assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
               "Expected vectorization factor less "
               "than original vector size.");
        SmallVector<int> UniformMask(VF, 0);
        std::iota(UniformMask.begin(), UniformMask.end(), 0);
        V = FinalShuffle(V, UniformMask);
      }
    }
    // Update the operand gather node if the operand is actually not a
    // vectorized node but a buildvector/gather node matching one of the
    // vectorized nodes.
    if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
          return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
        }) == VE->UserTreeIndices.end()) {
      auto *It =
          find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
                   TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
          });
      assert(It != VectorizableTree.end() && "Expected gather node operand.");
      (*It)->VectorizedValue = V;
    }
    return V;
  }

  // Find the corresponding gather entry and vectorize it.
  auto *I = find_if(VectorizableTree,
                    [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
                      return TE->isOperandGatherNode({E, NodeIdx});
                    });
  assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
  assert(I->get()->UserTreeIndices.size() == 1 &&
         "Expected only single user for the gather node.");
  assert(I->get()->isSame(VL) && "Expected same list of scalars.");
  return vectorizeTree(I->get(), PostponedPHIs);
}
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();
  bool NeedFreeze = false;
  SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
                                       E->ReuseShuffleIndices.end());
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Clear values, to be replaced by insertvector instructions.
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    for_each(MutableArrayRef(GatheredScalars)
                 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
             [&](Value *&V) { V = PoisonValue::get(V->getType()); });
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  // Build a mask out of the reorder indices and reorder scalars per this mask.
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  SmallVector<int> SubVectorsMask;
  inversePermutation(E->ReorderIndices, SubVectorsMask);
  // Transform non-clustered elements in the mask to poison (-1).
  // "Clustered" operations will be reordered using this mask later.
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (unsigned I : seq<unsigned>(GatheredScalars.size()))
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
  } else {
    SubVectorsMask.clear();
  }
  SmallVector<Value *> StoredGS(GatheredScalars);
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
    unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      // Check whether the splat may be poisonous by inspecting the sibling
      // operand of the user node.
      auto *It =
          find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
            return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
                     return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
                   }) != TE->UserTreeIndices.end();
          });
      if (It == VectorizableTree.end())
        return false;
      SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
      if (!(*It)->ReorderIndices.empty()) {
        inversePermutation((*It)->ReorderIndices, ReorderMask);
        reorderScalars(GS, ReorderMask);
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
                   (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
                    is_contained(E->Scalars, V1));
          }))
        return false;
    }
    if ((Mask.size() < InputVF
         /* && ... (subvector-extract check elided in the excerpt) */) ||
        (Mask.size() == InputVF &&
         ShuffleVectorInst::isIdentityMask(
             Mask.slice(I * SliceSize,
                        getNumElems(Mask.size(), SliceSize, I)),
             Mask.size()))) {
      std::iota(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(),
                          I * SliceSize +
                              getNumElems(Mask.size(), SliceSize, I)),
                0);
    } else {
      // ... (splat-index fill elided in the excerpt)
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= GatheredScalars.size() /* || ... */)
    NumParts = 1;
  if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
    // Check for gathered extracts.
    bool Resized = false;
    ExtractShuffles =
        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(ExtractMask)) {
        // ...
        if (const auto *TE = getTreeEntry(
                cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand()))
          ExtractEntries.push_back(TE);
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            Resized = true;
            GatheredScalars.append(VF - GatheredScalars.size(),
                                   PoisonValue::get(OrigScalarTy));
          }
      }
    }
    // Check for shuffled gathers only after the full-match check above.
    if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
        ((E->getOpcode() == Instruction::Load ||
          any_of(E->Scalars, IsaPred<LoadInst>)) &&
         any_of(E->Scalars,
                [this](Value *V) {
                  return isa<LoadInst>(V) && getTreeEntry(V);
                })) ||
        E->isAltShuffle() ||
        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffles =
          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
    }
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          /* ... single-source kind check elided ... */
          Entries.front().front()->isSame(E->Scalars)) {
        // Perfect match in the graph: reuse the previously vectorized node.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          << shortBundleName(E->Scalars) << ".\n");
        Mask.resize(E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
        } else {
          for (auto [I, V] : enumerate(E->Scalars)) {
            if (isa<PoisonValue>(V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        ShuffleBuilder.add(*FrontTE, Mask);
        Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
        return Res;
      }
      if (!Resized) {
        if (GatheredScalars.size() != VF &&
            any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
              return any_of(TEs, [&](const TreeEntry *TE) {
                return TE->getVectorFactor() == VF;
              });
            }))
          GatheredScalars.append(VF - GatheredScalars.size(),
                                 PoisonValue::get(OrigScalarTy));
      }
      // Remove shuffled elements from the list of gathers.
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        if (Mask[I] != PoisonMaskElem)
          GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
      }
    }
  }
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
    Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-constant values and all constant values; repeated
    // values are handled by the shuffle mask.
    int NumNonConsts = 0;
    int SinglePos = 0;
    for (auto [I, V] : enumerate(Scalars)) {
      if (isa<UndefValue>(V)) {
        if (!isa<PoisonValue>(V)) {
          ReuseMask[I] = I;
          UndefPos.push_back(I);
        }
        continue;
      }
      if (isConstant(V)) {
        ReuseMask[I] = I;
        continue;
      }
      ++NumNonConsts;
      SinglePos = I;
      Value *OrigV = V;
      Scalars[I] = PoisonValue::get(OrigScalarTy);
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore single insert element.
      if (IsSplat) {
        ReuseMask.assign(VF, PoisonMaskElem);
        std::swap(Scalars.front(), Scalars[SinglePos]);
        if (!UndefPos.empty() && UndefPos.front() == 0)
          Scalars.front() = UndefValue::get(OrigScalarTy);
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with a simple broadcast: legal
      // when the broadcasted value is known non-poisonous or when the incoming
      // scalar can be frozen first.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
                (E->UserTreeIndices.size() == 1 &&
                 any_of(V->uses(), [E](const Use &U) {
                   // Check if the value is already used in the same operation
                   // in one of the nodes already.
                   return E->UserTreeIndices.front().EdgeIdx !=
                              U.getOperandNo() &&
                          is_contained(
                              E->UserTreeIndices.front().UserTE->Scalars,
                              U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalar and emit a broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          ReuseMask[I] = Pos;
          if (I != Pos)
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
      } else {
        // Replace undefs by poisons, emit a broadcast and then a freeze.
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          if (isa<UndefValue>(Scalars[I]))
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // A gather of extractelements can be represented as a shuffle of the
      // one/two vectors the scalars are extracted from. Find the input
      // vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
          ExtractMask[I] = PoisonMaskElem;
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          if (isa<UndefValue>(E->Scalars[I]))
            continue;
          auto *EI = cast<ExtractElementInst>(StoredGS[I]);
          Value *VecOp = EI->getVectorOperand();
          if (const auto *TE = getTreeEntry(VecOp))
            if (TE->VectorizedValue)
              VecOp = TE->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1) &&
                         isGuaranteedNotToBePoison(Vec2);
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1);
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
                           /*ForExtracts=*/true);
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
        auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
        VecMask.assign(VecMask.size(), PoisonMaskElem);
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue)
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
                isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
        }
      }
    }
    // Decide how to combine the remaining values: build one constant vector
    // plus a shuffle, or several separate gathers.
    SmallVector<Value *> NonConstants(GatheredScalars);
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Build a constant vector and shuffle with it only if we currently have a
    // single permutation and more than one scalar constant.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(V) && !isa<PoisonValue>(V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(V) && !isa<UndefValue>(V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(V) && !isa<PoisonValue>(V);
         }) > 1);
    // NonConstants keeps only non-constant values; GatheredScalars keeps only
    // constants, used to build the final vector and then shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(OrigScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
    // Generate constants for the final shuffle and build a mask for them.
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask,
          E->Scalars.size(), [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
            TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
            Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
          });
  } else if (!allConstant(GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
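// processBuildVector() above is a template shared by cost modeling and code
// generation: instantiated with a cost-estimating builder it prices a gather
// node, while createBuildVector() below instantiates it with
// ShuffleInstructionBuilder to emit the actual IR, reusing vectorized
// sub-trees, gathered extracts and constant pools where possible.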
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  bool PostponedPHIs) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder, *this);
}
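// vectorizeTree(TreeEntry *, bool) emits vector code for a single tree node.
// Gather nodes are delegated to createBuildVector(); vectorized nodes are
// dispatched on their (possibly alternate) opcode in the switch below, and
// every produced vector is run through FinalShuffle to apply reorder and
// reuse masks before being cached in E->VectorizedValue.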
Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
  // ... (a small helper over the scalar list is elided in the excerpt:
  //      for (Value *V : VL) if (isa<Instruction>(V)) ...)

  if (E->VectorizedValue &&
      (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
       E->isAltShuffle())) {
    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  if (!isa<CmpInst>(V))
    ScalarTy = getValueType(V);
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    if (VecTy)
      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
  }
  auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
  if (E->isGather()) {
    // Set insert point for non-reduction initial nodes.
    if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
    E->VectorizedValue = Vec;
    return Vec;
  }

  bool IsReverseOrder =
      !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
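  // Every case below follows the same protocol: fetch operands via
  // vectorizeOperand(), bail out early if a diamond merge already materialized
  // E->VectorizedValue, insert MinBWs-driven integer casts when operand
  // bit-widths were shrunk, and count the new instruction in
  // NumVectorInstructions.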
  switch (ShuffleOrOp) {
    case Instruction::PHI: {
      assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
              E != VectorizableTree.front().get() ||
              !E->UserTreeIndices.empty()) &&
             "PHI reordering is free.");
      if (PostponedPHIs && E->VectorizedValue)
        return E->VectorizedValue;
      auto *PH = cast<PHINode>(VL0);
      Builder.SetInsertPoint(PH->getParent(),
                             PH->getParent()->getFirstNonPHIIt());
      if (PostponedPHIs || !E->VectorizedValue) {
        PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
        E->PHI = NewPhi;
        Value *V = NewPhi;
        // Adjust the insertion point once all PHIs have been generated.
        Builder.SetInsertPoint(PH->getParent(),
                               PH->getParent()->getFirstInsertionPt());
        V = FinalShuffle(V, E);
        E->VectorizedValue = V;
        if (PostponedPHIs)
          return V;
      }
      PHINode *NewPhi = cast<PHINode>(E->PHI);
      // PHI nodes may have multiple entries from the same block; visit every
      // block once.
      SmallPtrSet<BasicBlock *, 4> VisitedBBs;
      for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
        BasicBlock *IBB = PH->getIncomingBlock(I);
        // Stop emission if all incoming values are generated.
        if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return NewPhi;
        }
        if (!VisitedBBs.insert(IBB).second) {
          NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
          continue;
        }
        Builder.SetInsertPoint(IBB->getTerminator());
        Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
        if (VecTy != Vec->getType()) {
          assert((It != MinBWs.end() ||
                  MinBWs.contains(getOperandEntry(E, I))) &&
                 "Expected item in MinBWs.");
          Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
        }
        NewPhi->addIncoming(Vec, IBB);
      }
      assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
             "Invalid number of incoming values");
      assert(E->VectorizedValue && "Expected vectorized value.");
      return E->VectorizedValue;
    }
    case Instruction::ExtractElement: {
      Value *V = E->getSingleOperand(0);
      if (const TreeEntry *TE = getTreeEntry(V))
        V = TE->VectorizedValue;
      setInsertPointAfterBundle(E);
      V = FinalShuffle(V, E);
      E->VectorizedValue = V;
      return V;
    }
    case Instruction::ExtractValue: {
      auto *LI = cast<LoadInst>(E->getSingleOperand(0));
      Builder.SetInsertPoint(LI);
      Value *Ptr = LI->getPointerOperand();
      LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
      Value *NewV = ::propagateMetadata(V, E->Scalars);
      NewV = FinalShuffle(NewV, E);
      E->VectorizedValue = NewV;
      return NewV;
    }
    case Instruction::InsertElement: {
      assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
      Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
      Value *V = vectorizeOperand(E, 1, PostponedPHIs);
      ArrayRef<Value *> Op = E->getOperand(1);
      Type *ScalarTy = Op.front()->getType();
      if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
        std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
        assert(Res.first > 0 && "Expected item in MinBWs.");
        V = Builder.CreateIntCast(
            V,
            getWidenedType(
                ScalarTy,
                cast<FixedVectorType>(V->getType())->getNumElements()),
            Res.second);
      }

      // Find the first insertelement whose vector operand is not one of the
      // bundle's own inserts -- that is the buildvector's base.
      auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
        return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
      }));
      const unsigned NumElts =
          cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
      const unsigned NumScalars = E->Scalars.size();

      unsigned Offset = *getElementIndex(VL0);
      assert(Offset < NumElts && "Failed to find vector index offset");

      // Create a shuffle to resize the vector.
      SmallVector<int> Mask;
      if (!E->ReorderIndices.empty()) {
        inversePermutation(E->ReorderIndices, Mask);
        Mask.append(NumElts - NumScalars, PoisonMaskElem);
      } else {
        Mask.assign(NumElts, PoisonMaskElem);
        std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
      }
      bool IsIdentity = true;
      SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
      Mask.swap(PrevMask);
      for (unsigned I = 0; I < NumScalars; ++I) {
        Value *Scalar = E->Scalars[PrevMask[I]];
        unsigned InsertIdx = *getElementIndex(Scalar);
        IsIdentity &= InsertIdx - Offset == I;
        Mask[InsertIdx - Offset] = I;
      }
      if (!IsIdentity || NumElts != NumScalars) {
        Value *V2 = nullptr;
        bool IsVNonPoisonous =
            !isConstant(V) && isGuaranteedNotToBePoison(V);
        SmallVector<int> InsertMask(Mask);
        if (NumElts != NumScalars && Offset == 0) {
          // Follow the chain of insertelement instructions of the current
          // buildvector sequence.
          InsertElementInst *Ins = cast<InsertElementInst>(VL0);
          do {
            std::optional<unsigned> InsertIdx = getElementIndex(Ins);
            if (!InsertIdx)
              break;
            if (InsertMask[*InsertIdx] == PoisonMaskElem)
              InsertMask[*InsertIdx] = *InsertIdx;
            if (!Ins->hasOneUse())
              break;
            Ins = dyn_cast_or_null<InsertElementInst>(
                Ins->getUniqueUndroppableUser());
          } while (Ins);
          SmallBitVector UseMask =
              buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          SmallBitVector IsFirstUndef =
              isUndefVector(FirstInsert->getOperand(0), UseMask);
          if (!IsFirstPoison.all()) {
            unsigned Idx = 0;
            for (unsigned I = 0; I < NumElts; I++) {
              if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
                  IsFirstUndef.test(I)) {
                if (IsVNonPoisonous) {
                  InsertMask[I] = I < NumScalars ? I : 0;
                  continue;
                }
                if (!V2)
                  V2 = UndefValue::get(V->getType());
                if (Idx >= NumScalars)
                  Idx = NumScalars - 1;
                InsertMask[I] = NumScalars + Idx;
                ++Idx;
              }
              // ... (remaining lane adjustment elided in the excerpt)
            }
          }
        }
        if (!V2)
          V2 = PoisonValue::get(V->getType());
        V = Builder.CreateShuffleVector(V, V2, InsertMask);
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
      for (unsigned I = 0; I < NumElts; I++) {
        if (Mask[I] != PoisonMaskElem)
          InsertMask[Offset + I] = I;
      }
      SmallBitVector UseMask =
          buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
      SmallBitVector IsFirstUndef =
          isUndefVector(FirstInsert->getOperand(0), UseMask);
      if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
          NumElts != NumScalars) {
        if (IsFirstUndef.all()) {
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
              if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
                InsertMask[I] = I + NumElts;
            }
          }
          V = Builder.CreateShuffleVector(
              V,
              IsFirstPoison.all() ? PoisonValue::get(V->getType())
                                  : FirstInsert->getOperand(0),
              InsertMask, cast<Instruction>(E->Scalars.back())->getName());
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        } else {
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem)
              InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
            else
              InsertMask[I] += NumElts;
          }
          V = Builder.CreateShuffleVector(
              FirstInsert->getOperand(0), V, InsertMask,
              cast<Instruction>(E->Scalars.back())->getName());
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      }

      ++NumVectorInstructions;
      E->VectorizedValue = V;
      return V;
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      setInsertPointAfterBundle(E);

      Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }

      auto *CI = cast<CastInst>(VL0);
      Instruction::CastOps VecOpcode = CI->getOpcode();
      Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
      auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
      if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
          (SrcIt != MinBWs.end() || It != MinBWs.end() ||
           SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
        // Check if the values are candidates to demote.
        unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
        if (SrcIt != MinBWs.end())
          SrcBWSz = SrcIt->second.first;
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
        if (BWSz == SrcBWSz) {
          VecOpcode = Instruction::BitCast;
        } else if (BWSz < SrcBWSz) {
          VecOpcode = Instruction::Trunc;
        } else if (It != MinBWs.end()) {
          assert(BWSz > SrcBWSz && "Invalid cast!");
          VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
        } else if (SrcIt != MinBWs.end()) {
          assert(BWSz > SrcBWSz && "Invalid cast!");
          VecOpcode =
              SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
        }
      } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
                 !SrcIt->second.second) {
        VecOpcode = Instruction::UIToFP;
      }
      Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                     ? InVec
                     : Builder.CreateCast(VecOpcode, InVec, VecTy);
      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::FCmp:
    case Instruction::ICmp: {
      setInsertPointAfterBundle(E);

      Value *L = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      Value *R = vectorizeOperand(E, 1, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      if (L->getType() != R->getType()) {
        assert((getOperandEntry(E, 0)->isGather() ||
                getOperandEntry(E, 1)->isGather() ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        if (cast<VectorType>(L->getType())
                ->getElementType()
                ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                             ->getElementType()
                                             ->getIntegerBitWidth()) {
          Type *CastTy = R->getType();
          L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
        } else {
          Type *CastTy = L->getType();
          R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
        }
      }

      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
      Value *V = Builder.CreateCmp(P0, L, R);
      propagateIRFlags(V, E->Scalars, VL0);
      if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
        ICmp->setSameSign(/*B=*/false);
      // Do not cast for cmps.
      VecTy = cast<FixedVectorType>(V->getType());
      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::Select: {
      setInsertPointAfterBundle(E);

      Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      Value *True = vectorizeOperand(E, 1, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      Value *False = vectorizeOperand(E, 2, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      if (True->getType() != VecTy || False->getType() != VecTy) {
        assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
                getOperandEntry(E, 2)->isGather() ||
                MinBWs.contains(getOperandEntry(E, 1)) ||
                MinBWs.contains(getOperandEntry(E, 2))) &&
               "Expected item in MinBWs.");
        if (True->getType() != VecTy)
          True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
        if (False->getType() != VecTy)
          False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
      }

      unsigned CondNumElements = getNumElements(Cond->getType());
      unsigned TrueNumElements = getNumElements(True->getType());
      assert(TrueNumElements >= CondNumElements &&
             TrueNumElements % CondNumElements == 0 &&
             "Cannot vectorize Instruction::Select");
      assert(TrueNumElements == getNumElements(False->getType()) &&
             "Cannot vectorize Instruction::Select");
      if (CondNumElements != TrueNumElements) {
        // When the return type is i1 but the source is a fixed vector type, we
        // need to duplicate the condition value.
        Cond = Builder.CreateShuffleVector(
            Cond, createReplicatedMask(TrueNumElements / CondNumElements,
                                       CondNumElements));
      }
      assert(getNumElements(Cond->getType()) == TrueNumElements &&
             "Cannot vectorize Instruction::Select");
      Value *V = Builder.CreateSelect(Cond, True, False);
      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::FNeg: {
      setInsertPointAfterBundle(E);

      Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }

      Value *V = Builder.CreateUnOp(
          static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
      propagateIRFlags(V, E->Scalars, VL0);
      if (auto *I = dyn_cast<Instruction>(V))
        V = ::propagateMetadata(I, E->Scalars);

      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      ++NumVectorInstructions;

      return V;
    }
    case Instruction::Freeze: {
      setInsertPointAfterBundle(E);

      Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }

      if (Op->getType() != VecTy) {
        assert((It != MinBWs.end() ||
                MinBWs.contains(getOperandEntry(E, 0))) &&
               "Expected item in MinBWs.");
        Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
      }
      Value *V = Builder.CreateFreeze(Op);
      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      ++NumVectorInstructions;

      return V;
    }
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      setInsertPointAfterBundle(E);

      Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          ArrayRef<Value *> Ops = E->getOperand(I);
          if (all_of(Ops, [&](Value *Op) {
                auto *CI = dyn_cast<ConstantInt>(Op);
                return CI && CI->getValue().countr_one() >= It->second.first;
              })) {
            // Redundant mask operation; drop the operation and its mask.
            Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
            E->VectorizedValue = V;
            ++NumVectorInstructions;
            return V;
          }
        }
      }
      if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
        assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
                getOperandEntry(E, 1)->isGather() ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        if (LHS->getType() != VecTy)
          LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
        if (RHS->getType() != VecTy)
          RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
      }

      Value *V = Builder.CreateBinOp(
          static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
      propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
      if (auto *I = dyn_cast<Instruction>(V)) {
        V = ::propagateMetadata(I, E->Scalars);
        // Drop nuw flags for abs(sub(commutative), true).
        if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
            any_of(E->Scalars, [](Value *V) {
              return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      }

      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      ++NumVectorInstructions;

      return V;
    }
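    // The Load case below handles three shapes: consecutive loads become one
    // wide load, StridedVectorize nodes become a
    // llvm.experimental.vp.strided.load with a constant or runtime stride
    // (negated for reversed order), and ScatterVectorize nodes become a
    // masked gather over a vector of pointers.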
    case Instruction::Load: {
      // Loads are inserted at the head of the tree because we don't want to
      // sink them all the way down past store instructions.
      setInsertPointAfterBundle(E);

      LoadInst *LI = cast<LoadInst>(VL0);
      Instruction *NewLI;
      Value *PO = LI->getPointerOperand();
      if (E->State == TreeEntry::Vectorize) {
        NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
      } else if (E->State == TreeEntry::StridedVectorize) {
        Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
        Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
        PO = IsReverseOrder ? PtrN : Ptr0;
        std::optional<int> Diff = getPointersDiff(
            VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
        Type *StrideTy = DL->getIndexType(PO->getType());
        Value *StrideVal;
        if (Diff) {
          int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
          StrideVal =
              ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                             DL->getTypeAllocSize(ScalarTy));
        } else {
          SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
          transform(E->Scalars, PointerOps.begin(), [](Value *V) {
            return cast<LoadInst>(V)->getPointerOperand();
          });
          std::optional<Value *> Stride =
              calculateRtStride(PointerOps, ScalarTy, *DL, *SE,
                                &*Builder.GetInsertPoint());
          Value *NewStride =
              Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
          StrideVal = Builder.CreateMul(
              NewStride,
              ConstantInt::get(
                  StrideTy,
                  (IsReverseOrder ? -1 : 1) *
                      static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
        }
        Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
        auto *Inst = Builder.CreateIntrinsic(
            Intrinsic::experimental_vp_strided_load,
            {VecTy, PO->getType(), StrideTy},
            {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
             Builder.getInt32(E->Scalars.size())});
        NewLI = Inst;
      } else {
        assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
        Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        if (isa<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          // CreateMaskedGather expects VecTy and VecPtr to have the same size;
          // expand VecPtr if ScalarTy is a vector type.
          unsigned ScalarTyNumElements =
              cast<FixedVectorType>(ScalarTy)->getNumElements();
          unsigned VecTyNumElements =
              cast<FixedVectorType>(VecTy)->getNumElements();
          assert(VecTyNumElements % ScalarTyNumElements == 0 &&
                 "Cannot expand getelementptr.");
          unsigned VF = VecTyNumElements / ScalarTyNumElements;
          SmallVector<Constant *> Indices(VecTyNumElements);
          transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
            return Builder.getInt64(I % ScalarTyNumElements);
          });
          VecPtr = Builder.CreateGEP(
              VecTy->getElementType(),
              Builder.CreateShuffleVector(
                  VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
              ConstantVector::get(Indices));
        }
        // Use the minimum alignment of the gathered loads.
        Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
        NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
      }
      Value *V = ::propagateMetadata(NewLI, E->Scalars);

      V = FinalShuffle(V, E);
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
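    // Stores mirror the load handling: consecutive stores become one wide
    // store of the (final-shuffled) vector value, while strided stores are
    // emitted as llvm.experimental.vp.strided.store, with the pointer taken
    // from the front store after applying the reorder indices.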
    case Instruction::Store: {
      auto *SI = cast<StoreInst>(VL0);

      setInsertPointAfterBundle(E);

      Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
      if (VecValue->getType() != VecTy)
        VecValue =
            Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
      VecValue = FinalShuffle(VecValue, E);

      Value *Ptr = SI->getPointerOperand();
      Instruction *ST;
      if (E->State == TreeEntry::Vectorize) {
        ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
      } else {
        assert(E->State == TreeEntry::StridedVectorize &&
               "Expected either strided or consecutive stores.");
        if (!E->ReorderIndices.empty()) {
          SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
          Ptr = SI->getPointerOperand();
        }
        Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
        Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
        auto *Inst = Builder.CreateIntrinsic(
            Intrinsic::experimental_vp_strided_store,
            {VecTy, Ptr->getType(), StrideTy},
            {VecValue, Ptr,
             ConstantInt::get(
                 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
             Builder.getAllOnesMask(VecTy->getElementCount()),
             Builder.getInt32(E->Scalars.size())});
        ST = Inst;
      }

      Value *V = ::propagateMetadata(ST, E->Scalars);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::GetElementPtr: {
      auto *GEP0 = cast<GetElementPtrInst>(VL0);
      setInsertPointAfterBundle(E);

      Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }

      SmallVector<Value *> OpVecs;
      for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
        Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        OpVecs.push_back(OpVec);
      }

      Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
      if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
        SmallVector<Value *> GEPs;
        for (Value *V : E->Scalars) {
          if (isa<GetElementPtrInst>(V))
            GEPs.push_back(V);
        }
        V = ::propagateMetadata(I, GEPs);
      }

      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      ++NumVectorInstructions;

      return V;
    }
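    // Calls are vectorized either through a matching vector library function
    // (via VFDatabase) or a vector intrinsic, whichever getVectorCallCosts
    // finds cheaper; arguments the intrinsic requires to stay scalar are
    // passed through unvectorized.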
    case Instruction::Call: {
      CallInst *CI = cast<CallInst>(VL0);
      setInsertPointAfterBundle(E);

      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                          VecCallCosts.first <= VecCallCosts.second;

      Value *ScalarArg = nullptr;
      SmallVector<Value *> OpVecs;
      SmallVector<Type *, 2> TysForDecl;
      auto *CEI = cast<CallInst>(VL0);
      for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
        // Some intrinsics have scalar arguments. These arguments should not
        // be vectorized.
        if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
          ScalarArg = CEI->getArgOperand(I);
          // If we decided to reduce the bitwidth of an abs intrinsic, its
          // second argument must be set to false (do not return poison if the
          // value is the signed minimum).
          if (ID == Intrinsic::abs && It != MinBWs.end() &&
              It->second.first < DL->getTypeSizeInBits(CEI->getType()))
            ScalarArg = Builder.getFalse();
          OpVecs.push_back(ScalarArg);
          continue;
        }

        Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        ScalarArg = CEI->getArgOperand(I);
        if (cast<VectorType>(OpVec->getType())->getElementType() !=
                ScalarArg->getType()->getScalarType() &&
            It == MinBWs.end()) {
          auto *CastTy =
              getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
          OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
        } else if (It != MinBWs.end()) {
          OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
        }
        OpVecs.push_back(OpVec);
      }

      Function *CF;
      if (!UseIntrinsic) {
        // Use a vector library function, if available.
        VFShape Shape =
            VFShape::get(CI->getFunctionType(),
                         ElementCount::getFixed(VecTy->getNumElements()),
                         /*HasGlobalPred=*/false);
        CF = VFDatabase(*CI).getVectorizedFunction(Shape);
      } else {
        // ... (intrinsic declaration lookup elided in the excerpt)
      }

      SmallVector<OperandBundleDef, 1> OpBundles;
      CI->getOperandBundlesAsDefs(OpBundles);
      Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

      propagateIRFlags(V, E->Scalars, VL0);
      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
    case Instruction::ShuffleVector: {
      Value *V;
      if (SLPReVec && !E->isAltShuffle()) {
        setInsertPointAfterBundle(E);
        Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
        if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
          assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
                 "Not supported shufflevector usage.");
          SmallVector<int> NewMask(ThisMask.size());
          transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
            return SVSrc->getShuffleMask()[Mask];
          });
          V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
        } else {
          V = Builder.CreateShuffleVector(Src, ThisMask);
        }
        propagateIRFlags(V, E->Scalars, VL0);
        if (auto *I = dyn_cast<Instruction>(V))
          V = ::propagateMetadata(I, E->Scalars);
        V = FinalShuffle(V, E);
      } else {
        assert(E->isAltShuffle() &&
               ((Instruction::isBinaryOp(E->getOpcode()) &&
                 Instruction::isBinaryOp(E->getAltOpcode())) ||
                (Instruction::isCast(E->getOpcode()) &&
                 Instruction::isCast(E->getAltOpcode())) ||
                (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
               "Invalid Shuffle Vector Operand");

        Value *LHS = nullptr, *RHS = nullptr;
        if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
          setInsertPointAfterBundle(E);
          LHS = vectorizeOperand(E, 0, PostponedPHIs);
          if (E->VectorizedValue) {
            LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
            return E->VectorizedValue;
          }
          RHS = vectorizeOperand(E, 1, PostponedPHIs);
        } else {
          setInsertPointAfterBundle(E);
          LHS = vectorizeOperand(E, 0, PostponedPHIs);
        }
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        if (LHS && RHS &&
            ((Instruction::isBinaryOp(E->getOpcode()) &&
              (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
             (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
          assert((It != MinBWs.end() ||
                  getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                  getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                  MinBWs.contains(getOperandEntry(E, 0)) ||
                  MinBWs.contains(getOperandEntry(E, 1))) &&
                 "Expected item in MinBWs.");
          Type *CastTy = VecTy;
          if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
            if (cast<VectorType>(LHS->getType())
                    ->getElementType()
                    ->getIntegerBitWidth() <
                cast<VectorType>(RHS->getType())
                    ->getElementType()
                    ->getIntegerBitWidth())
              CastTy = RHS->getType();
            else
              CastTy = LHS->getType();
          }
          if (LHS->getType() != CastTy)
            LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
          if (RHS->getType() != CastTy)
            RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
        }

        Value *V0, *V1;
        if (Instruction::isBinaryOp(E->getOpcode())) {
          V0 = Builder.CreateBinOp(
              static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
          V1 = Builder.CreateBinOp(
              static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
        } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
          V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
          auto *AltCI = cast<CmpInst>(E->getAltOp());
          CmpInst::Predicate AltPred = AltCI->getPredicate();
          V1 = Builder.CreateCmp(AltPred, LHS, RHS);
        } else {
          if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
            unsigned SrcBWSz = DL->getTypeSizeInBits(
                cast<VectorType>(LHS->getType())->getElementType());
            unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
            if (BWSz <= SrcBWSz) {
              if (BWSz < SrcBWSz)
                LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
              assert(LHS->getType() == VecTy &&
                     "Expected same type as operand.");
              if (auto *I = dyn_cast<Instruction>(LHS))
                LHS = ::propagateMetadata(I, E->Scalars);
              LHS = FinalShuffle(LHS, E);
              E->VectorizedValue = LHS;
              ++NumVectorInstructions;
              return LHS;
            }
          }
          V0 = Builder.CreateCast(
              static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
          V1 = Builder.CreateCast(
              static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
        }
        // Add V0 and V1 to later analysis to try to find and remove matching
        // instructions, if any.
        for (Value *V : {V0, V1}) {
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }

        // Create a shuffle to take the alternate operations from the vector.
        // Also gather main and alternate scalar ops to propagate IR flags to
        // each vector operation.
        ValueList OpScalars, AltScalars;
        SmallVector<int> Mask;
        E->buildAltOpShuffleMask(
            [E, this](Instruction *I) {
              assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
              return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                            *TLI);
            },
            Mask, &OpScalars, &AltScalars);

        propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
        propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
        auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
          // Drop nuw flags for abs(sub(commutative), true).
          if (auto *I = dyn_cast<Instruction>(Vec);
              I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
              any_of(E->Scalars, [](Value *V) {
                if (isa<PoisonValue>(V))
                  return false;
                auto *IV = cast<Instruction>(V);
                return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
              }))
            I->setHasNoUnsignedWrap(/*b=*/false);
        };
        DropNuwFlag(V0, E->getOpcode());
        DropNuwFlag(V1, E->getAltOpcode());

        if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
                                                 Mask);
        }
        V = Builder.CreateShuffleVector(V0, V1, Mask);
        if (auto *I = dyn_cast<Instruction>(V)) {
          V = ::propagateMetadata(I, E->Scalars);
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      E->VectorizedValue = V;
      ++NumVectorInstructions;

      return V;
    }
    default:
      llvm_unreachable("unknown inst");
  }
  return nullptr;
}
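// The driver vectorizeTree() below finishes the whole graph: it schedules all
// blocks, emits every entry (postponing PHI operands and dependent gathers
// until their inputs exist), extracts externally used scalars, and finally
// erases the scalar instructions that were fully replaced by vector code.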
Value *BoUpSLP::vectorizeTree(/* parameters elided in the excerpt */) {
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }
  // Clear the cache of last instructions per node: vectorization may create
  // side effects such as extra uses.
  EntryToLastInstruction.clear();
  // Emit gathered loads first to generate better code for their users.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst &&
        (!TE->isGather() || !TE->UserTreeIndices.empty())) {
      assert((!TE->UserTreeIndices.empty() ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
    }
  }
  // Postpone emission of PHI operands to avoid cyclic dependency issues.
  (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
    if (TE->State == TreeEntry::Vectorize &&
        TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
        TE->VectorizedValue)
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
  // Emit delayed gathers, replacing their temporary placeholder values.
  auto PostponedNodes = PostponedGathers.takeVector();
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
      if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
              TE->UserTreeIndices.front().EdgeIdx)) &&
          VecTE->isSame(TE->Scalars))
        // Found a gather node identical to one of the vectorized nodes (may
        // happen after reordering).
        continue;
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI =
        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // Find the insertion point: past the PHIs when the user is a PHI,
    // otherwise before the earliest user in the same block.
    Instruction *InsertPt = UserI;
    if (isa<PHINode>(UserI)) {
      // ... (PHI insert-point handling elided in the excerpt)
    }
    for (User *U : PrevVec->users()) {
      // ...
      auto *UI = dyn_cast<Instruction>(U);
      if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
        continue;
      if (UI->comesBefore(InsertPt))
        InsertPt = UI;
    }
    Builder.SetInsertPoint(InsertPt);
    Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        VecI /* && ... (placement check elided in the excerpt) */) {
      // ...
    }
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      // Determine the signedness of the cast from the MinBWs info of the
      // scalars' own entries, multi-node entries and gather nodes.
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        if (const TreeEntry *BaseTE = getTreeEntry(V)) {
          auto It = MinBWs.find(BaseTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            if (*IsSigned)
              break;
          }
          for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
            auto It = MinBWs.find(MNTE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          // Scan through the gather nodes.
          for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
            auto It = MinBWs.find(BVE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
            IsSigned =
                IsSigned.value_or(false) ||
                !isKnownNonNegative(EE->getVectorOperand(),
                                    SimplifyQuery(*DL));
            continue;
          }
          if (IsSigned.value_or(false))
            break;
        }
      }
      if (IsSigned.value_or(false)) {
        // Final attempt -- check the user node.
        auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(IsSigned &&
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node, if it was already used for one of the
    // buildvector nodes.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    eraseInstruction(PrevVec);
  }
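  // Next, every externally used scalar is replaced by an extract from its
  // vector. ExtractAndExtendIfNeeded (below) also rebuilds the integer
  // extension when the vector was emitted at a narrower bit-width (MinBWs),
  // and extracts are cached per basic block (ScalarToEEs) to avoid emitting
  // duplicates.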
  // ... (declarations of the local caches -- ScalarToEEs,
  //      ScalarsWithNullptrUser, IgnoredExtracts, VectorCasts, UsedInserts,
  //      ShuffledInserts -- are elided in the excerpt)
  // Extract all of the elements with external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    llvm::User *User = ExternalUse.User;

    // Skip users that we already RAUW'd; this happens when one instruction
    // has multiple uses of the same value.
    if (User && !is_contained(Scalar->users(), User))
      continue;
    TreeEntry *E = getTreeEntry(Scalar);
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    // Non-instruction pointers are not deleted, just skip them.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        auto *Inst = dyn_cast<Instruction>(Scalar);
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts, just move the only one in the
          // current block.
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
              if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
                CI->moveAfter(I);
            }
            Ex = PrevV;
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // "Reuse" the existing extract to improve final codegen.
          if (ReplaceInst) {
            // Leave the instruction as is, if it is cheaper to extract and all
            // operands are scalar.
            if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
              IgnoredExtracts.insert(EE);
              Ex = EE;
            } else {
              auto *CloneInst = Inst->clone();
              CloneInst->insertBefore(Inst);
              if (Inst->hasName())
                CloneInst->takeName(Inst);
              Ex = CloneInst;
            }
          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
                     ES && isa<Instruction>(Vec)) {
            Value *V = ES->getVectorOperand();
            auto *IVec = cast<Instruction>(Vec);
            if (const TreeEntry *ETE = getTreeEntry(V))
              V = ETE->VectorizedValue;
            if (auto *IV = dyn_cast<Instruction>(V);
                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            else
              Ex = Builder.CreateExtractElement(Vec, Lane);
          } else if (auto *VecTy =
                         dyn_cast<FixedVectorType>(Scalar->getType())) {
            // Revectorized scalar: extract the whole subvector.
            unsigned VecTyNumElements = VecTy->getNumElements();
            Ex = createExtractVector(
                Builder, Vec, VecTyNumElements,
                ExternalUse.Lane * VecTyNumElements);
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          // If necessary, sign- or zero-extend to the original scalar type.
          ExV = Ex;
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
                                        MinBWs.find(E)->second.second);
          auto *I = dyn_cast<Instruction>(Ex);
          ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
                                            : &F->getEntryBlock(),
                                          std::make_pair(Ex, ExV));
        }
        // The branch above may produce constants, since operand 0 might be a
        // constant.
        if (auto *ExI = dyn_cast<Instruction>(Ex);
            ExI && !isa<PHINode>(ExI)) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        return ExV;
      }
      assert(isa<FixedVectorType>(Scalar->getType()) &&
             isa<InsertElementInst>(Scalar) &&
             "In-tree scalar of vector type is not insertelement?");
      auto *IE = cast<InsertElementInst>(Scalar);
      VectorToInsertElement.try_emplace(Vec, IE);
      return Vec;
    };
    // If User == nullptr, the Scalar either remains as a scalar in vectorized
    // instructions or is used as an extra argument. Generate an
    // ExtractElement and update the record in ExternallyUsedValues.
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert(
          (ExternallyUsedValues.count(Scalar) ||
           ExternalUsesAsOriginalScalar.contains(Scalar) ||
           any_of(Scalar->users(),
                  [&](llvm::User *U) {
                    if (ExternalUsesAsOriginalScalar.contains(U))
                      return true;
                    TreeEntry *UseEntry = getTreeEntry(U);
                    return UseEntry &&
                           (UseEntry->State == TreeEntry::Vectorize ||
                            UseEntry->State ==
                                TreeEntry::StridedVectorize) &&
                           (E->State == TreeEntry::Vectorize ||
                            E->State == TreeEntry::StridedVectorize) &&
                           doesInTreeUserNeedToExtract(
                               Scalar, getRootEntryInstruction(*UseEntry),
                               TLI, TTI);
                  })) &&
          "Scalar with nullptr User must be registered in "
          "ExternallyUsedValues map or remain as scalar in vectorized "
          "instructions");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI)) {
          if (PHI->getParent()->isLandingPad())
            Builder.SetInsertPoint(
                PHI->getParent(),
                std::next(
                    PHI->getParent()->getLandingPadInst()->getIterator()));
          else
            Builder.SetInsertPoint(PHI->getParent(),
                                   PHI->getParent()->getFirstNonPHIIt());
        } else {
          Builder.SetInsertPoint(VecI->getParent(),
                                 std::next(VecI->getIterator()));
        }
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // Required to update internally referenced instructions.
      if (Scalar != NewInst) {
        assert((!isa<ExtractElementInst>(Scalar) ||
                !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
               "Extractelements should not be replaced.");
        Scalar->replaceAllUsesWith(NewInst);
      }
      continue;
    }

    if (auto *VU = dyn_cast<InsertElementInst>(User);
        VU && VU->getOperand(1) == Scalar) {
      // Skip if the scalar is another vector op or Vec is not an instruction.
      if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
        if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
          if (!UsedInserts.insert(VU).second)
            continue;
          // Need to use the original vector, if the root is truncated.
          auto BWIt = MinBWs.find(E);
          if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
            auto *ScalarTy = FTy->getElementType();
            auto Key = std::make_pair(Vec, ScalarTy);
            auto VecIt = VectorCasts.find(Key);
            if (VecIt == VectorCasts.end()) {
              IRBuilderBase::InsertPointGuard Guard(Builder);
              if (auto *IVec = dyn_cast<PHINode>(Vec)) {
                if (IVec->getParent()->isLandingPad())
                  Builder.SetInsertPoint(IVec->getParent(),
                                         std::next(IVec->getParent()
                                                       ->getLandingPadInst()
                                                       ->getIterator()));
                else
                  Builder.SetInsertPoint(
                      IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
              } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
                Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
              }
              Vec = Builder.CreateIntCast(
                  Vec,
                  getWidenedType(
                      ScalarTy,
                      cast<FixedVectorType>(Vec->getType())->getNumElements()),
                  BWIt->second.second);
              VectorCasts.try_emplace(Key, Vec);
            } else {
              Vec = VecIt->second;
            }
          }

          std::optional<unsigned> InsertIdx = getElementIndex(VU);
          if (InsertIdx) {
            auto *It = find_if(
                ShuffledInserts,
                [VU](const ShuffledInsertData<Value *> &Data) {
                  // Checks if 2 insertelements are from the same buildvector.
                  InsertElementInst *VecInsert = Data.InsertElements.front();
                  return areTwoInsertFromSameBuildVector(
                      VU, VecInsert,
                      [](InsertElementInst *II) { return II->getOperand(0); });
                });
            unsigned Idx = *InsertIdx;
            if (It == ShuffledInserts.end()) {
              (void)ShuffledInserts.emplace_back();
              It = std::next(ShuffledInserts.begin(),
                             ShuffledInserts.size() - 1);
            }
            SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
            if (Mask.empty())
              Mask.assign(FTy->getNumElements(), PoisonMaskElem);
            Mask[Idx] = ExternalUse.Lane;
            It->InsertElements.push_back(cast<InsertElementInst>(User));
            continue;
          }
        }
      }
    }

    // Generate extracts for out-of-tree users and find the insertion point for
    // the extractelement lane.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (PHINode *PH = dyn_cast<PHINode>(User)) {
        // If the user is a PHI node, insert before the incoming block's
        // terminator.
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
            }
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        Builder.SetInsertPoint(cast<Instruction>(User));
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        User->replaceUsesOfWith(Scalar, NewInst);
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      User->replaceUsesOfWith(Scalar, NewInst);
    }
  }
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };

  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
    if (VF != VecVF) {
      if (any_of(Mask,
                 [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
        Vec = CreateShuffle(Vec, nullptr, Mask);
        return std::make_pair(Vec, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        Vec = CreateShuffle(Vec, nullptr, ResizeMask);
      }
    }
    return std::make_pair(Vec, false);
  };
  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // Find the first and the last instruction in the list of insertelements.
    sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    Value *NewInst = performExtractsShuffleAction<Value>(
        MutableArrayRef(Vector.data(), Vector.size()),
        FirstInsert->getOperand(0),
        [](Value *Vec) {
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
        },
        ResizeToVF,
        [&](ArrayRef<int> Mask, ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create a shuffle if the mask is a simple identity
            // non-resizing mask.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : FirstInsert->getOperand(0),
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild the buildvector chain.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    SmallVector<Instruction *> Inserts;
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      if (*It == II)
        ++It;
      else
        Inserts.push_back(cast<Instruction>(II));
      II = dyn_cast<InsertElementInst>(II->getOperand(0));
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
      NewInst = II;
    }
    LastInsert->replaceAllUsesWith(NewInst);
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            PoisonValue::get(IE->getOperand(0)->getType()));
      IE->replaceUsesOfWith(IE->getOperand(1),
                            PoisonValue::get(IE->getOperand(1)->getType()));
      eraseInstruction(IE);
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
  // For each vectorized value, erase the now-dead scalar instructions.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather())
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
        continue;
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
        continue;
      if (isa<PoisonValue>(Scalar))
        continue;
#ifndef NDEBUG
      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
        for (User *U : Scalar->users()) {
          // It is legal to delete users in the ignorelist.
          assert((getTreeEntry(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
        }
      }
#endif
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      auto *I = cast<Instruction>(Scalar);
      RemovedInsts.push_back(I);
    }
  }

  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);
  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntry(I);
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() &&
            !IE->UserTreeIndices.empty() &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             any_of(IE->UserTreeIndices,
                    [&](const EdgeInfo &EI) {
                      return EI.UserTE == VectorizableTree.front().get() &&
                             EI.EdgeIdx == UINT_MAX;
                    }))) &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() &&
            is_contained(VectorizableTree.front()->Scalars, I)))
        continue;
      SmallVector<SelectInst *> LogicalOpSelects;
      I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
        // Do not replace condition of the logical op in form select <cond>.
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          return false;
        }
        return UserIgnoreList->contains(U.getUser());
      });
      // Replace conditions of the poisoning logical ops with the non-poison
      // constant value.
      for (SelectInst *SI : LogicalOpSelects)
        SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
    }
  }
  // ...
  removeInstructionsAndOperands(ArrayRef(RemovedInsts));

  Builder.ClearInsertionPoint();
  InstrElementSize.clear();
  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                        cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
  }
  return Vec;
}

void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or any of its operands are defined inside this loop, we
    // cannot hoist the instruction.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator());
    CSEBlocks.insert(PreHeader);
  }

  // Make a list of all reachable blocks in our CSE queue.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB)) {
      assert(DT->isReachableFromEntry(N));
      CSEWorkList.push_back(N);
    }

  // Sort blocks by domination so that we visit a block after all blocks
  // dominating it have been visited.
  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });
  // Check if the worklist contains duplicate instructions that could be
  // CSE'd. An instruction is "identical or less defined" if it matches
  // another one except that its shuffle mask may have extra undef lanes.
  auto IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1, Instruction *I2,
                                              SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the last undefs actually change the final number of used
    // vector registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(SI1->getType()) ==
               TTI->getNumberOfParts(
                   getWidenedType(SI1->getType()->getElementType(),
                                  SM1.size() - LastUndefsCnt));
  };
  // Perform O(N^2) search over the gather/shuffle sequences and merge
  // identical instructions.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
      if (isDeleted(&In))
        continue;
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the previously
      // visited instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          eraseInstruction(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          In.moveAfter(V);
          V->replaceAllUsesWith(&In);
          eraseInstruction(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced) {
        assert(!is_contained(Visited, &In));
        Visited.push_back(&In);
      }
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
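
// Builds a scheduling bundle out of VL: the per-instruction ScheduleData
// nodes are chained via FirstInBundle/NextInBundle so the scheduler can
// treat the whole group as a single scheduling entity.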
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
  ScheduleData *Bundle = nullptr;
  ScheduleData *PrevInBundle = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member "
           "(maybe not in same basic block)");
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle) {
      PrevInBundle->NextInBundle = BundleMember;
    } else {
      Bundle = BundleMember;
    }

    // Group the instructions to a bundle.
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  assert(Bundle && "Failed to find schedule bundle");
  return Bundle;
}
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  // No need to schedule PHIs, insertelement, extractelement and extractvalue
  // instructions.
  if (isa<PHINode>(S.getMainOp()) ||
      isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
    return nullptr;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.getMainOp() << "\n");

  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](
                                   bool ReSchedule, ScheduleData *Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    if (ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
        if (ScheduleData *SD = getScheduleData(I))
          SD->clearDependencies();
      ReSchedule = true;
    }
    if (Bundle) {
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
    }

    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it. Note that it is
    // important that we don't "schedule" the bundle yet.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleData *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
             "must be ready to schedule");
      schedule(Picked, ReadyInsts);
    }
  };

  // Make sure that the scheduling region contains all instructions of the
  // bundle.
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // Recalculate all dependencies anyway; otherwise the compiler may crash
      // trying to incorrectly calculate dependencies and emit instructions in
      // the wrong order at the actual scheduling.
      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");

    // Make sure we don't leave the pieces of the bundle in the ready list
    // when the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);

    if (!BundleMember->IsScheduled)
      continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
    LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  auto *Bundle = buildBundle(VL);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle->isReady()) {
    cancelScheduling(VL, S.getMainOp());
    return std::nullopt;
  }
  return Bundle;
}
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
                                                Value *OpValue) {
  if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
      doesNotNeedToSchedule(VL))
    return;

  ScheduleData *Bundle = getScheduleData(OpValue);
  LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() &&
         (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
         "tried to unbundle something which is not a bundle");

  // Remove the bundle from the ready list.
  if (Bundle->isReady())
    ReadyInsts.remove(Bundle);

  // Un-bundle: make single instructions out of the bundle.
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->TE = nullptr;
    if (BundleMember->unscheduledDepsInBundle() == 0) {
      ReadyInsts.insert(BundleMember);
    }
    BundleMember = Next;
  }
}
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
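
// Grows the scheduling region so that it covers instruction V. The region is
// a contiguous range [ScheduleStart, ScheduleEnd) inside one basic block; the
// search walks up and down simultaneously (skipping assume-like intrinsics)
// and gives up once the region would exceed ScheduleRegionSizeLimit.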
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
         !doesNotNeedToBeScheduled(I) &&
         "phi nodes/insertelements/extractelements/extractvalues don't need to "
         "be scheduled");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so that's not
  // counted against the budget; otherwise debug info could affect codegen.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
      return false;
    }

    ++UpIter;
    ++DownIter;

    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
    if (doesNotNeedToBeScheduled(I))
      continue;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
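
// (Re)computes the dependencies of a bundle: def-use edges to other bundles
// in the region, conservative control dependencies for instructions that may
// not transfer execution (and around stacksave/stackrestore), and memory
// dependencies along the NextLoadStore chain, bounded by
// MaxMemDepDistance/AliasedCheckLimit to keep this quadratic walk cheap.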
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  SmallVector<ScheduleData *, 10> WorkList;
  WorkList.push_back(SD);

  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.pop_back_val();
    for (ScheduleData *BundleMember = SD; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      assert(isInSchedulingRegion(BundleMember));
      if (BundleMember->hasValidDependencies())
        continue;

      BundleMember->Dependencies = 0;
      BundleMember->resetUnscheduledDeps();

      // Handle def-use chain dependencies.
      for (User *U : BundleMember->Inst->users()) {
        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
      }

      auto MakeControlDependent = [&](Instruction *I) {
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->ControlDependencies.push_back(BundleMember);
        BundleMember->Dependencies++;
        ScheduleData *DestBundle = DepDest->FirstInBundle;
        if (!DestBundle->IsScheduled)
          BundleMember->incrementUnscheduledDeps(1);
        if (!DestBundle->hasValidDependencies())
          WorkList.push_back(DestBundle);
      };

      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
        for (Instruction *I = BundleMember->Inst->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          // ...
          MakeControlDependent(I);
          // ...
        }
      }

      if (RegionHasStackSave) {
        // An inalloca alloca must be scheduled after any preceding stacksave,
        // and no alloca may be reordered above a preceding stackrestore.
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst,
                  m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              break;
            if (!isa<AllocaInst>(I))
              continue;

            // Add the dependency.
            MakeControlDependent(I);
          }
        }

        // Likewise, prevent allocas and loads/stores from moving below a
        // stacksave or a stackrestore.
        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              continue;

            // Add the dependency.
            MakeControlDependent(I);
            break;
          }
        }
      }

      // Handle the memory dependencies (if any).
      ScheduleData *DepDest = BundleMember->NextLoadStore;
      if (!DepDest)
        continue;
      assert(BundleMember->Inst->mayReadOrWriteMemory() &&
             "NextLoadStore list for non memory effecting bundle?");
      bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
      unsigned NumAliased = 0;
      unsigned DistToSrc = 1;

      for (; DepDest; DepDest = DepDest->NextLoadStore) {
        assert(isInSchedulingRegion(DepDest));
        // ...
        if (DistToSrc >= MaxMemDepDistance ||
            ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
             (NumAliased >= AliasedCheckLimit ||
              SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
          // ...
          DepDest->MemoryDependencies.push_back(BundleMember);
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = DepDest->FirstInBundle;
          if (!DestBundle->IsScheduled) {
            BundleMember->incrementUnscheduledDeps(1);
          }
          if (!DestBundle->hasValidDependencies()) {
            WorkList.push_back(DestBundle);
          }
        }
        // ...
      }
    }
  }

  if (InsertInReadyList && SD->isReady()) {
    ReadyInsts.insert(SD);
    // ...
  }
}
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = getScheduleData(I)) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->IsScheduled = false;
      SD->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
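
// Performs the actual scheduling of a block: re-sorts the ready list by the
// original instruction order (SchedulingPriority) and moves each picked
// bundle member to its final position just above the previously scheduled
// instruction.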
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // If we got here, pre-scheduling found a valid scheduling of the sub-graph
  // of the scheduling window which consists of all vector bundles and their
  // transitive users.
  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] TreeEntry *SDTE = getTreeEntry(SD->Inst);
      assert((isVectorLikeInstWithConstOps(SD->Inst) ||
              SD->isPartOfBundle() ==
                  (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;

      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      Instruction *PickedInst = BundleMember->Inst;
      if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
        PickedInst->moveAfter(LastScheduledInst->getPrevNode());
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
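
// Computes the widest element width (in bits) that is relevant for
// vectorizing V: for stores this is the stored type; otherwise the expression
// tree below V is walked looking for loads/extracts whose type may suggest a
// better width than V's own type.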
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type.
  if (auto *I = dyn_cast<Instruction>(V)) {
    SmallPtrSet<Instruction *, 16> Visited;
    SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
    Worklist.emplace_back(I, I->getParent());

    // Traverse the expression tree in bottom-up order looking for loads.
    auto Width = 0u;
    Value *FirstNonBool = nullptr;
    while (!Worklist.empty()) {
      auto [I, Parent] = Worklist.pop_back_val();

      // We should only be looking at scalar instructions here. If the current
      // instruction has a vector type, skip.
      auto *Ty = I->getType();
      if (isa<VectorType>(Ty))
        continue;
      if (Ty != Builder.getInt1Ty() && !FirstNonBool)
        FirstNonBool = I;

      // If the current instruction is a load, update Width to reflect the
      // width of the loaded value.
      if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
        Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));

      // Otherwise, we need to visit the operands of the instruction. If an
      // operand is an instruction we haven't yet visited and from the same
      // basic block as the user (or the user is a PHI node), add it to the
      // worklist.
      else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
                   BinaryOperator, UnaryOperator>(I)) {
        for (Use &U : I->operands()) {
          if (auto *J = dyn_cast<Instruction>(U.get()))
            if (Visited.insert(J).second &&
                (isa<PHINode>(I) || J->getParent() == Parent)) {
              Worklist.emplace_back(J, J->getParent());
              continue;
            }
          if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
            FirstNonBool = U.get();
        }
      } else {
        break;
      }
    }

    // If we didn't encounter a memory access in the expression tree, or if we
    // gave up for some reason, just return the width of V.
    if (!Width) {
      if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
        V = FirstNonBool;
      Width = DL->getTypeSizeInBits(V->getType());
    }

    for (Instruction *I : Visited)
      InstrElementSize[I] = Width;

    return Width;
  }

  // Otherwise, return the width of V's type.
  return DL->getTypeSizeInBits(V->getType());
}
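
// Walks a tree entry and decides whether its scalars can be computed in a
// narrower bit width without changing the observable result. Demotable
// entries are collected in ToDemote; the per-opcode switch below encodes the
// legality rule for each kind of instruction.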
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  if (OrigBitWidth == BitWidth) {
    MaxDepthLevel = 1;
    return true;
  }

  // Check if the node was analyzed already and must keep its original
  // bitwidth.
  if (NodesToKeepBWs.contains(E.Idx))
    return false;

  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(V))
      return true;
    // ...
    unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      // ...
    }
    if (auto *I = dyn_cast<Instruction>(V)) {
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        // ...
      }
      BitWidth1 = std::min(BitWidth1, BitWidth2);
    }
    BitWidth = std::max(BitWidth, BitWidth1);
    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
  };
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      // Check possible extractelement instruction bases and the final vector
      // length.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          ::getNumberOfParts(*TTI, getWidenedType(OrigScalarTy, VF)) ==
              ::getNumberOfParts(
                  *TTI,
                  getWidenedType(
                      IntegerType::get(OrigScalarTy->getContext(), BitWidth),
                      VF)))
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !getTreeEntry(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !all_of(V->users(), [=](User *U) {
          return getTreeEntry(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          BitWidth = OrigBitWidth;
          return false;
        }
        MaxDepthLevel = 1;
        BitWidth = BestFailBitwidth;
        NeedToExit = true;
        return true;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                              std::ref(BitWidth)));
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (E.UserTreeIndices.size() > 1 &&
              !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                           std::ref(BitWidth))))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };
  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // in-range amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // We can truncate a logical shr to a smaller lshr iff we know that the
    // bits we would otherwise be shifting in are already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // We can truncate an arithmetic shr to a smaller ashr iff all the bits
    // between the original sign bit and the truncated sign bit are equal.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                                nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      // ...
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
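
// Analyzes the whole vectorizable tree and records, in MinBWs, the minimum
// bit width each entry really needs, so that codegen can emit the operations
// on narrower vectors and truncate/extend only at the tree boundaries.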
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->getOpcode() == Instruction::Store ||
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  if (VectorizableTree[NodeIdx]->isGather() ||
      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
      (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                              [NodeIdx](const EdgeInfo &EI) {
                                return EI.UserTE->Idx > NodeIdx;
                              })))
    return;

  // The first value node for store/insertelement is sext/zext/trunc? Skip it,
  // it does not affect the minimum bitwidth analysis.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  SmallDenseSet<unsigned, 8> NodesToKeepBWs;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Opcode, unsigned Limit, bool IsTruncRoot,
          bool IsSignedCmp) -> unsigned {
    // Check if the root is trunc and the next node is gather/buildvector.
    // Then keep trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    const TreeEntry *TE = getTreeEntry(U);
                    const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
                    if (TE == UserTE || !TE)
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(UserTE->getMainOp()))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    auto It = MinBWs.find(TE);
                    if (It != MinBWs.end() && It->second.first > UserTESz)
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT || !Opcode)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision.
    unsigned MaxBitWidth = 1u;

    // Determine if the sign bit of all the roots is known to be zero, so the
    // roots can be zero-extended back to their original type rather than
    // sign-extended.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    // Check if all the bits of the roots are demanded.
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
        continue;
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
      TypeSize NumTypeBits =
          DL->getTypeSizeInBits(Root->getType()->getScalarType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, we must add one to the
      // maximum bit width to make sure the narrowed value can be
      // sign-extended back.
      if (!IsKnownPositive)
        ++BitWidth1;

      APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // ...
    if (NumParts > 1 &&
        /* the demoted type still occupies as many vector registers */ false)
      return 0u;

    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression.
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    DenseSet<const TreeEntry *> Visited;
    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);

    return MaxBitWidth;
  };

  // Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
    if (all_of(*UserIgnoreList,
               [](Value *V) {
                 return isa<PoisonValue>(V) ||
                        cast<Instruction>(V)->getOpcode() == Instruction::Add;
               }) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
          continue;
        unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
          ++BitWidth1;
        unsigned BitWidth2 = BitWidth1;
        if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(
                ::getRdxKind(V))) {
          APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        }
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;

      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
        Limit, IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            unsigned OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [](const EdgeInfo &EI) {
                   return EI.EdgeIdx == 0 &&
                          EI.UserTE->getOpcode() == Instruction::Trunc &&
                          !EI.UserTE->isAltShuffle();
                 });
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [&](const EdgeInfo &EI) {
                   return EI.UserTE->getOpcode() == Instruction::ICmp &&
                          any_of(EI.UserTE->Scalars, [&](Value *V) {
                            auto *IC = dyn_cast<ICmpInst>(V);
                            return IC &&
                                   (IC->isSigned() ||
                                    !isKnownNonNegative(IC->getOperand(0),
                                                        SimplifyQuery(*DL)) ||
                                    !isKnownNonNegative(IC->getOperand(1),
                                                        SimplifyQuery(*DL)));
                          });
                 });
    }

    // If the maximum bit width we compute is not smaller than the width of
    // the roots' type, we can't demote - bail out.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
                ->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
      NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
PreservedAnalyses SLPVectorizerPass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  // Fetch the required analyses from the pass manager.
  // ...
  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);

  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
  if (!RunSLPVectorization)
    return false;
  SE = SE_;
  TTI = TTI_;
  TLI = TLI_;
  AA = AA_;
  LI = LI_;
  DT = DT_;
  AC = AC_;
  DB = DB_;
  DL = &F.getDataLayout();

  Stores.clear();
  GEPs.clear();
  bool Changed = false;

  // If the target claims to have no vector registers don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom up slp vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    if (!DT->isReachableFromEntry(BB))
      continue;
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
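    LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
  }
  return Changed;
}

// Tries to vectorize one chain of consecutive stores. Roughly: returns true
// on success, false when vectorization is possible but not profitable (Size
// reports the canonical tree size for retry heuristics), and std::nullopt
// when the chain should not be retried with a smaller vector factor.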
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  Size = 0;
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!has_single_bit(Sz) ||
      !hasFullVectorsOrPowerOf2(
          *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
          VF) ||
      VF < 2 || VF < MinVF) {
    // ...
    return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsAllowedSize =
        hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                                 ValOps.size()) ||
        (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
    if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(), [&](Value *V) {
            return !isa<ExtractElementInst>(V) &&
                   (V->getNumUses() > Chain.size() ||
                    any_of(V->users(), [&](User *U) {
                      return !Stores.contains(U);
                    }));
          }))) ||
        (ValOps.size() > Chain.size() / 2 && !S)) {
      Size = (!IsAllowedSize && S) ? 1 : 2;
      return false;
    }
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
                    << "\n");
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}

/// Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  unsigned Num = 0;
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        if (Size == 1)
          return V;
        ++Num;
        return V + Size;
      });
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       if (P == 1)
                         return V;
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 81 / (Mean * Mean) == 0;
}
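
// Vectorizes the stores collected for one underlying object. Stores are
// bucketed by their distance from a common base pointer; contiguous runs are
// then attacked with a shrinking set of candidate vector factors until a
// profitable tree is found. NonSchedulable caches slice sizes that repeatedly
// failed to schedule so they are not rebuilt.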
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
  bool Changed = false;

  struct StoreDistCompare {
    bool operator()(const std::pair<unsigned, int> &Op1,
                    const std::pair<unsigned, int> &Op2) const {
      return Op1.second < Op2.second;
    }
  };
  // A set of pairs (index of store in Stores array ref, Distance of the store
  // address relative to the base store address in units).
  using StoreIndexToDistSet =
      std::set<std::pair<unsigned, int>, StoreDistCompare>;
  auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
    int PrevDist = -1;
    BoUpSLP::ValueList Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(Set)) {
      if (Operands.empty() || Data.second - PrevDist == 1) {
        Operands.push_back(Stores[Data.first]);
        PrevDist = Data.second;
        if (Idx != Set.size() - 1)
          continue;
      }
      auto E = make_scope_exit([&, &DataVar = Data]() {
        Operands.clear();
        Operands.push_back(Stores[DataVar.first]);
        PrevDist = DataVar.second;
      });

      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      unsigned MinVF = std::max<unsigned>(
          2, PowerOf2Ceil(TTI->getStoreMinimumVF(
                 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
                 ValueTy)));

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < " << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // lanes are used.
        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
        if (has_single_bit(CandVF + 1)) {
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        }
      }

      unsigned MaxRegVF = MaxVF;
      MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < " << "MinVF (" << MinVF << ")\n");
        continue;
      }

      SmallVector<unsigned> CandidateVFs;
      unsigned Size = MinVF;
      // ...
      for_each(reverse(CandidateVFs), [&](unsigned &VF) {
        VF = Size > MaxVF ? NonPowerOf2VF : Size;
        Size *= 2;
      });
      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
        P.first = P.second = 1;
      });
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned Size : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned StartIdx = std::distance(
              RangeSizes.begin(),
              find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
                                            std::placeholders::_1)));
          while (StartIdx < End) {
            unsigned EndIdx =
                std::distance(RangeSizes.begin(),
                              find_if(RangeSizes.drop_front(StartIdx),
                                      std::bind(IsVectorized, Size >= MaxRegVF,
                                                std::placeholders::_1)));
            unsigned Sz = EndIdx >= End ? End : EndIdx;
            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
              if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
                                  Size >= MaxRegVF)) {
                ++Cnt;
                continue;
              }
              ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
                  Cnt += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
              if (!Res) {
                NonSchedulable
                    .try_emplace(Slice.front(), std::make_pair(Size, Size))
                    .first->getSecond()
                    .second = Size;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                AnyProfitableGraph = RepeatChanged = Changed = true;
                for_each(RangeSizes.slice(Cnt, Size),
                         [](std::pair<unsigned, unsigned> &P) {
                           P.first = P.second = 0;
                         });
                if (Cnt < StartIdx + MinVF) {
                  for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  StartIdx = Cnt + Size;
                }
                if (Cnt > Sz - Size - MinVF) {
                  for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  // ...
                }
                Cnt += Size;
                continue;
              }
              if (Size > 2 && Res &&
                  !all_of(RangeSizes.slice(Cnt, Size),
                          std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
                                    std::placeholders::_1))) {
                Cnt += Size;
                continue;
              }
              // Check for very large VFs that we are not rebuilding the same
              // trees, just with a larger number of elements.
              if (Size > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(Cnt, Size),
                         std::bind(FirstSizeSame, TreeSize,
                                   std::placeholders::_1))) {
                Cnt += Size;
                while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
                  ++Cnt;
                continue;
              }
              if (TreeSize > 1)
                for_each(RangeSizes.slice(Cnt, Size),
                         [&](std::pair<unsigned, unsigned> &P) {
                           if (Size >= MaxRegVF)
                             P.second = std::max(P.second, TreeSize);
                           else
                             P.first = std::max(P.first, TreeSize);
                         });
              ++Cnt;
              AnyProfitableGraph = true;
            }
            if (StartIdx >= End)
              break;
            if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
              AnyProfitableGraph = true;
            StartIdx = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(Sz),
                        std::bind(IsNotVectorized, Size >= MaxRegVF,
                                  std::placeholders::_1)));
          }
          if (!AnyProfitableGraph && Size >= MaxRegVF)
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if we tried all attempts or the last attempts are unneeded.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            Operands.size(),
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes, std::bind(IsNotVectorized, true,
                                                  std::placeholders::_1))) +
                1));
        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
        unsigned Limit =
            getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
        CandidateVFs.clear();
        if (bit_floor(Limit) == VF)
          CandidateVFs.push_back(Limit);
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        });
        // Last attempt to vectorize the maximal number of elements, if all
        // previous attempts were unsuccessful because of cost issues.
        CandidateVFs.push_back(VF);
      }
    }
  };

  // Stores pair (first: index of the store into the Stores array ref whose
  // address is taken as base, second: sorted set of pairs {index, dist}: the
  // indices of stores in the set and their store location distances relative
  // to the base address).
  SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
      std::optional<int> Diff = getPointersDiff(
          Stores[Set.first]->getValueOperand()->getType(),
          Stores[Set.first]->getPointerOperand(),
          SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
          /*StrictCheck=*/true);
      if (!Diff)
        continue;
      auto It = Set.second.find(std::make_pair(Idx, *Diff));
      if (It == Set.second.end()) {
        Set.second.emplace(Idx, *Diff);
        return;
      }
      // Try to vectorize the first found set to avoid duplicate analysis.
      TryToVectorize(Set.second);
      unsigned ItIdx = It->first;
      int ItDist = It->second;
      StoreIndexToDistSet PrevSet;
      copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
              [&](const std::pair<unsigned, int> &Pair) {
                return Pair.first > ItIdx;
              });
      Set.second.clear();
      Set.first = Idx;
      Set.second.emplace(Idx, 0);
      // Insert stores that followed the previous match to try to vectorize
      // them with this store.
      unsigned StartIdx = ItIdx + 1;
      SmallBitVector UsedStores(Idx - StartIdx);
      // Distances to the previously found dup store (or this store, since
      // they store to the same addresses).
      SmallVector<int> Dists(Idx - StartIdx, 0);
      for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
        // Do not try to vectorize sequences we already tried.
        if (VectorizedStores.contains(Stores[Pair.first]))
          break;
        unsigned BI = Pair.first - StartIdx;
        UsedStores.set(BI);
        Dists[BI] = Pair.second - ItDist;
      }
      for (unsigned I = StartIdx; I < Idx; ++I) {
        unsigned BI = I - StartIdx;
        if (UsedStores.test(BI))
          Set.second.emplace(I, Dists[BI]);
      }
      return;
    }
    auto &Res = SortedStores.emplace_back();
    Res.first = Idx;
    Res.second.emplace(Idx, 0);
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (auto &Set : SortedStores)
        TryToVectorize(Set.second);
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (auto &Set : SortedStores)
    TryToVectorize(Set.second);

  return Changed;
}
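
// Scans a basic block and collects the seed instructions for SLP: simple
// stores bucketed by the underlying object of their pointer operand, and
// single-index, non-constant-index GEPs whose index computations may be
// vectorizable.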
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
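
// Tries to vectorize an arbitrary list of scalars by sliding a window of
// decreasing vector factor over VL and building a tree for each candidate
// slice.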
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type, we permit
  // an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S)
    return false;

  Instruction *I0 = S.getMainOp();
  // Make sure invalid types (including vector type) are rejected before
  // determining vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give the user an internal llvm type name,
      // which may not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream rso(TypeStr);
        Ty->print(rso);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();
  Type *ScalarTy = getValueType(VL[0]);

  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
    // No actual vectorization should happen if the number of parts is the
    // same as the provided vectorization factor (i.e. the scalar type is used
    // for the vector code during codegen).
    auto *VecTy = getWidenedType(ScalarTy, VF);
    if (TTI->getNumberOfParts(VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);

      if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
        continue;

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
        break;

      SmallVector<Value *> Ops(ActualVF, nullptr);
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          Ops[Idx] = V;
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough vectorizable instructions - exit.
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      R.reorderTopToBottom();
      R.reorderBottomToTop(
          /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
          !R.doesRootHaveInTreeUses());
      R.transformNodes();
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  Value *P = I->getParent();

  // Vectorize in the current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidates.
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
      R);
}
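
// Model of a horizontal reduction: matches an associative reduction tree
// rooted at a binop (or cmp+select min/max), gathers the reduced values and
// reduction operations, and later emits the vectorized reduction.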
class HorizontalReduction {
  using ReductionOpsType = SmallVector<Value *, 16>;
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  ReductionOpsListType ReductionOps;
  /// List of possibly reduced values.
  SmallVector<SmallVector<Value *>> ReducedVals;
  // ...
  WeakTrackingVH ReductionRoot;
  /// The type of reduction operation.
  RecurKind RdxKind;
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;

  static bool isCmpSelMinMax(Instruction *I) {
    return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
           RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
  }

  // And/or are potentially poison-safe logical patterns like:
  // select x, y, false / select x, true, y
  static bool isBoolLogicOp(Instruction *I) {
    return isa<SelectInst>(I) &&
           (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
  }

  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I) {
    if (Kind == RecurKind::None)
      return false;

    // Integer ops that map to select instructions or intrinsics are fine.
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
        isBoolLogicOp(I))
      return true;

    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0.
      return I->getFastMathFlags().noNaNs();
    }

    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;

    return I->isAssociative();
  }

  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    // Poison-safe 'or' takes the form: select X, true, Y. To make that work
    // with the normal operand processing, we skip the true value operand.
    if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
      return I->getOperand(2);
    return I->getOperand(Index);
  }

  /// Creates reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    switch (Kind) {
    case RecurKind::Or: {
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
      // ...
    }
    case RecurKind::And: {
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
      // ...
    }
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
      if (UseSelect) {
        CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
        Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      [[fallthrough]];
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum: {
      Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
      return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
    }
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
  }

  /// Creates reduction operation with the current opcode with the IR flags
  /// from \p ReductionOps, dropping nuw/nsw flags.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
        propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
                         /*IncludeWrapFlags=*/false);
        propagateIRFlags(Op, ReductionOps[1], nullptr,
                         /*IncludeWrapFlags=*/false);
        return Op;
      }
    }
    propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
    return Op;
  }

public:
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;

    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;

    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;

    if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(), m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Try harder: look for a min/max pattern based on instructions
      // producing the same values, such as:
      //   select ((cmp Inst1, Inst2), Inst1, Inst2).
      // Both operands of the select must be extractelements (or both not), or
      // the pattern is not a supported min/max reduction:
      //   if (!isa<ExtractElementInst>(RHS) || ...) return RecurKind::None;
      //   if (!isa<ExtractElementInst>(LHS) || ...) return RecurKind::None;
      //   if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
      //     return RecurKind::None;
      // ...
      CmpInst::Predicate Pred;
      if (match(Select, m_Select(m_Cmp(Pred, m_Value(), m_Value()), m_Value(),
                                 m_Value()))) {
        if (Pred == CmpInst::ICMP_SGT)
          return RecurKind::SMax;
        if (Pred == CmpInst::ICMP_SLT)
          return RecurKind::SMin;
        if (Pred == CmpInst::ICMP_UGT)
          return RecurKind::UMax;
        if (Pred == CmpInst::ICMP_ULT)
          return RecurKind::UMin;
      }
    }
    return RecurKind::None;
  }
  /// Get the index of the first operand.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB.
  /// For a cmp+sel min/max reduction check that both ops are in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // The select must be used twice while the condition must have a single
      // use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }

    // An arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    auto *I = dyn_cast<Instruction>(Data.front());
    return Sz > 1 || isConstant(Data.front()) ||
           (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
  }
public:
  HorizontalReduction() = default;

  /// Try to find a reduction tree.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;

    // Analyze "regular" integer/FP types only.
    Type *Ty = Root->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;

    // Though the ultimate reduction may have multiple uses, its condition
    // must have only a single use.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // If the edge is not an instruction, or it differs from the main
        // reduction opcode or has too many uses - possible reduced value.
        if (!EdgeInst || Level > RecursionMaxDepth ||
            getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to try to
    // reduce them. Values are grouped by their value ids, loads - by loaded
    // pointers.
    MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
        PossibleReducedVals;
    initReductionOps(Root);
    DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
      if (!LoadKeyUsed.insert(Key).second) {
        auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
        if (LIt != LoadsMap.end()) {
          for (LoadInst *RLI : LIt->second) {
            if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                                LI->getType(), LI->getPointerOperand(), DL, SE,
                                /*StrictCheck=*/true))
              return hash_value(RLI->getPointerOperand());
          }
          for (LoadInst *RLI : LIt->second) {
            // ...
          }
          if (LIt->second.size() > 2) {
            // ...
            return hash_value(LIt->second.back()->getPointerOperand());
          }
        }
      }
      LoadsMap.try_emplace(std::make_pair(Key, Ptr))
          .first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // Add reduction values. The values are sorted for better vectorization
      // results.
      for (Value *V : PossibleRedVals) {
        size_t Key, Idx;
        std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                                               /*AllowAlternate=*/false);
        ++PossibleReducedVals[Key][Idx]
              .insert(std::make_pair(V, 0))
              .first->second;
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds to start the reduction
    // from the longest possible reduced values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
           It != E; ++It) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = It->second.takeVector();
        stable_sort(RedValsVect, llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      int NewIdx = -1;
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (NewIdx < 0 ||
            (!isGoodForReduction(Data) &&
             (!isa<LoadInst>(Data.front()) ||
              !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
              getUnderlyingObject(
                  cast<LoadInst>(Data.front())->getPointerOperand()) !=
                  getUnderlyingObject(
                      cast<LoadInst>(ReducedVals[NewIdx].front())
                          ->getPointerOperand())))) {
          NewIdx = ReducedVals.size();
          ReducedVals.emplace_back();
        }
        ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
      }
    }
    // Sort the reduced values by the number of same/alternate opcodes and/or
    // pointer operands.
    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
  /// Attempt to vectorize the tree found by matchAssociativeReduction.
  Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
                     const TargetLibraryInfo &TLI) {
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // If there are a sufficient number of reduction values, reduce to a
    // nearby power-of-2. We can safely generate oversized vectors and rely on
    // the backend to split them to legal sizes.
    if (unsigned NumReducedVals = std::accumulate(
            ReducedVals.begin(), ReducedVals.end(), 0,
            [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
              if (!isGoodForReduction(Vals))
                return Num;
              return Num + Vals.size();
            });
        NumReducedVals < ReductionLimit &&
        all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
          return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
        })) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      return nullptr;
    }

    IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
                                    TargetFolder(DL));
    Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));

    // Track the reduced values in case they are replaced by extractelement
    // because of the vectorization.
    DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
                                                  ReducedVals.front().size());

    // The compare instruction of a min/max is the insertion point for new
    // instructions and may be replaced with a new compare instruction.
    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
      assert(isa<SelectInst>(RdxRootInst) &&
             "Expected min/max reduction to have select root instruction");
      Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
      assert(isa<Instruction>(ScalarCond) &&
             "Expected min/max reduction to have compare condition");
      return cast<Instruction>(ScalarCond);
    };

    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
      return isBoolLogicOp(cast<Instruction>(V));
    });
    // Return new VectorizedTree, based on the previous value.
    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
      if (VectorizedTree) {
        // Update the final value in the reduction.
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
        if (AnyBoolLogicOp) {
          auto It = ReducedValsToOps.find(VectorizedTree);
          auto It1 = ReducedValsToOps.find(Res);
          if ((It == ReducedValsToOps.end() &&
               It1 == ReducedValsToOps.end()) ||
              isGuaranteedNotToBePoison(VectorizedTree) ||
              (It != ReducedValsToOps.end() &&
               any_of(It->getSecond(), [&](Instruction *I) {
                 return isBoolLogicOp(I) &&
                        getRdxOperand(I, 0) == VectorizedTree;
               }))) {
            ; // Do nothing.
          } else if (isGuaranteedNotToBePoison(Res) ||
                     (It1 != ReducedValsToOps.end() &&
                      any_of(It1->getSecond(), [&](Instruction *I) {
                        return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
                      }))) {
            std::swap(VectorizedTree, Res);
          } else {
            VectorizedTree = Builder.CreateFreeze(VectorizedTree);
          }
        }
        // ...
        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                        ReductionOps);
      }
      // Initialize the final value in the reduction.
      return Res;
    };
    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                      ReductionOps.front().size());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps) {
        if (!RdxOp)
          continue;
        IgnoreList.insert(RdxOp);
      }
    // Intersect the fast-math-flags from all reduction operations.
    FastMathFlags RdxFMF;
    RdxFMF.set();
    for (Value *U : IgnoreList)
      if (auto *FPMO = dyn_cast<FPMathOperator>(U))
        RdxFMF &= FPMO->getFastMathFlags();
    bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19930 for (
Value *V : Candidates)
19931 TrackedVals.try_emplace(V, V);
19934 Value *
V) ->
unsigned & {
19935 auto *It = MV.
find(V);
19936 assert(It != MV.
end() &&
"Unable to find given key.");
19945 bool CheckForReusedReductionOps =
false;
19950 for (
unsigned I = 0,
E = ReducedVals.
size();
I <
E; ++
I) {
19952 InstructionsState S = States[
I];
19956 for (
unsigned Cnt = 0, Sz = OrigReducedVals.
size(); Cnt < Sz; ++Cnt) {
19957 Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
19962 auto *Inst = dyn_cast<Instruction>(RdxVal);
19964 (!S || !S.isOpcodeOrAlt(Inst))) ||
19968 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19970 bool ShuffledExtracts =
false;
19972 if (S && S.getOpcode() == Instruction::ExtractElement &&
19973 !S.isAltShuffle() &&
I + 1 <
E) {
19975 for (
Value *RV : ReducedVals[
I + 1]) {
19976 Value *RdxVal = TrackedVals.at(RV);
19980 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19983 CommonCandidates.push_back(RdxVal);
19984 TrackedToOrig.try_emplace(RdxVal, RV);
19989 Candidates.
swap(CommonCandidates);
19990 ShuffledExtracts =
true;
19997 Value *OrigV = TrackedToOrig.at(Candidates.
front());
19998 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20000 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
20001 Value *OrigV = TrackedToOrig.at(VC);
20002 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20003 if (
auto *ResI = dyn_cast<Instruction>(Res))
20004 V.analyzedReductionRoot(ResI);
20006 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
20010 unsigned NumReducedVals = Candidates.
size();
20011 if (NumReducedVals < ReductionLimit &&
20012 (NumReducedVals < 2 || !
isSplat(Candidates)))
20017 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
20018 RdxKind != RecurKind::FMul &&
20019 RdxKind != RecurKind::FMulAdd;
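    // A value that occurs several times in a reduction can be folded into a
    // single scaled term, e.g. (a sketch of the idea, values hypothetical):
    //   x + x + x + x  ==>  x * 4    for integer add
    //   x ^ x ^ x      ==>  x        for xor (even counts cancel out)
    // Mul, FMul and FMulAdd are excluded above because repeated operands
    // would need a pow()-style scale rather than a cheap identity.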
    if (IsSupportedHorRdxIdentityOp)
      for (Value *V : Candidates) {
        Value *OrigV = TrackedToOrig.at(V);
        ++SameValuesCounter.try_emplace(OrigV).first->second;
    bool SameScaleFactor = false;
    bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                            SameValuesCounter.size() != Candidates.size();
    if (OptReusedScalars) {
          (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
           RdxKind == RecurKind::Xor) &&
                 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
                   return P.second == SameValuesCounter.front().second;
      Candidates.resize(SameValuesCounter.size());
      transform(SameValuesCounter, Candidates.begin(),
                [&](const auto &P) { return TrackedVals.at(P.first); });
      NumReducedVals = Candidates.size();
      if (NumReducedVals == 1) {
        Value *OrigV = TrackedToOrig.at(Candidates.front());
        unsigned Cnt = At(SameValuesCounter, OrigV);
            emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
        VectorizedVals.try_emplace(OrigV, Cnt);
        ExternallyUsedValues.insert(OrigV);
    unsigned MaxVecRegSize = V.getMaxVecRegSize();
    unsigned EltSize = V.getVectorElementSize(Candidates[0]);
    const unsigned MaxElts = std::clamp<unsigned>(
        RegMaxNumber * RedValsMaxNumber);
    unsigned ReduxWidth = NumReducedVals;
    auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
      unsigned NumParts, NumRegs;
      Type *ScalarTy = Candidates.front()->getType();
      while (NumParts > NumRegs) {
        ReduxWidth = bit_floor(ReduxWidth - 1);
      if (NumParts > NumRegs / 2)
    ReduxWidth = GetVectorFactor(ReduxWidth);
    ReduxWidth = std::min(ReduxWidth, MaxElts);
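    // A worked example of the width selection above (hypothetical target:
    // 128-bit registers, i.e. 4 x i32 per register, and a budget NumRegs of
    // 2): with 24 reduction values, <24 x i32> legalizes to 6 parts, so
    // ReduxWidth shrinks to bit_floor(23) = 16 (4 parts), then to
    // bit_floor(15) = 8 (2 parts), which fits; the result is finally
    // clamped to MaxElts.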
    unsigned Start = 0;
    unsigned Pos = Start;
    unsigned PrevReduxWidth = ReduxWidth;
    bool CheckForReusedReductionOpsLocal = false;
    auto AdjustReducedVals = [&](bool IgnoreVL = false) {
      bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
      if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
        CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
      if (Pos < NumReducedVals - ReduxWidth + 1)
        return IsAnyRedOpGathered;
      if (ReduxWidth > 1)
        ReduxWidth = GetVectorFactor(ReduxWidth);
      return IsAnyRedOpGathered;
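    // AdjustReducedVals drives the retry loop below over the candidate
    // window [Pos, Pos + ReduxWidth): while the window still fits it is
    // moved along the candidate list (the stepping itself is elided in this
    // listing), and only once it runs off the end does the width shrink via
    // GetVectorFactor, so a failed attempt at width 8 eventually retries
    // the same values with width-4 windows.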
    bool AnyVectorized = false;
    while (Pos < NumReducedVals - ReduxWidth + 1 &&
           ReduxWidth >= ReductionLimit) {
      if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
        CheckForReusedReductionOps = true;
      PrevReduxWidth = ReduxWidth;
      if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
              std::make_pair(Pos, bit_floor(ReduxWidth))) ||
              std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
          V.areAnalyzedReductionVals(VL)) {
        (void)AdjustReducedVals(true);
        auto *RedValI = dyn_cast<Instruction>(RedVal);
        return V.isDeleted(RedValI);
      V.buildTree(VL, IgnoreList);
      if (V.isTreeTinyAndNotFullyVectorizable(true)) {
        if (!AdjustReducedVals())
          V.analyzedReductionVals(VL);
      if (V.isLoadCombineReductionCandidate(RdxKind)) {
        if (!AdjustReducedVals())
          V.analyzedReductionVals(VL);
      V.reorderTopToBottom();
      V.reorderBottomToTop(true);
          ExternallyUsedValues);
      LocalExternallyUsedValues.insert(ReductionRoot);
      for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
        if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
        for (Value *V : ReducedVals[Cnt])
          if (isa<Instruction>(V))
            LocalExternallyUsedValues.insert(TrackedVals[V]);
      if (!IsSupportedHorRdxIdentityOp) {
               "Reused values counter map is not empty");
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
          Value *V = Candidates[Cnt];
          Value *OrigV = TrackedToOrig.at(V);
          ++SameValuesCounter.try_emplace(OrigV).first->second;
      V.transformNodes();
      for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
        if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
        Value *RdxVal = Candidates[Cnt];
        if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
          RdxVal = It->second;
        if (!Visited.insert(RdxVal).second)
        if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
          LocalExternallyUsedValues.insert(RdxVal);
        Value *OrigV = TrackedToOrig.at(RdxVal);
            VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
        if (NumOps != ReducedValsToOps.at(OrigV).size())
          LocalExternallyUsedValues.insert(RdxVal);
      if (!IsSupportedHorRdxIdentityOp)
        SameValuesCounter.clear();
      for (Value *RdxVal : VL)
        if (RequiredExtract.contains(RdxVal))
          LocalExternallyUsedValues.insert(RdxVal);
      V.buildExternalUses(LocalExternallyUsedValues);
      V.computeMinimumValueSizes();
          getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
                          << " for reduction\n");
        V.getORE()->emit([&]() {
                                          ReducedValsToOps.at(VL[0]).front())
                 << "Vectorizing horizontal reduction is possible "
                 << "but not beneficial with cost " << ore::NV("Cost", Cost)
                 << " and threshold "
        if (!AdjustReducedVals()) {
          V.analyzedReductionVals(VL);
          unsigned Offset = Pos == Start ? Pos : Pos - 1;
          if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
                     *TTI, VL.front()->getType(), ReduxWidth - 1);
                 VF >= ReductionLimit;
                     *TTI, VL.front()->getType(), VF - 1)) {
                  V.getCanonicalGraphSize() != V.getTreeSize())
              for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
      LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                        << Cost << ". (HorRdx)\n");
      V.getORE()->emit([&]() {
                                  ReducedValsToOps.at(VL[0]).front())
               << "Vectorized horizontal reduction with cost "
               << ore::NV("Cost", Cost) << " and with tree size "
               << ore::NV("TreeSize", V.getTreeSize());
      Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
      if (IsCmpSelMinMax)
        InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
      Value *VectorizedRoot =
          V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
      for (Value *RdxVal : Candidates) {
        Value *OrigVal = TrackedToOrig.at(RdxVal);
        Value *TransformedRdxVal = TrackedVals.at(OrigVal);
        if (TransformedRdxVal != RdxVal)
          TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
        VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
      if (OptReusedScalars && !SameScaleFactor) {
        VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                       SameValuesCounter, TrackedToOrig);
      Value *ReducedSubTree;
      Type *ScalarTy = VL.front()->getType();
      if (isa<FixedVectorType>(ScalarTy)) {
        for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
              emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
        ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
      if (ReducedSubTree->getType() != VL.front()->getType()) {
        assert(ReducedSubTree->getType() != VL.front()->getType() &&
               "Expected different reduction type.");
            Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
                                  V.isSignedMinBitwidthRootNode());
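      // Minimum-bitwidth analysis may have built the tree in a narrower
      // type than the original scalars, so the scalar reduction result is
      // cast back here, e.g. (an illustrative case) a sum of i8 values
      // computed as i8 and then sign- or zero-extended to the original i32,
      // with the signedness taken from the root node.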
      if (OptReusedScalars && SameScaleFactor)
        ReducedSubTree = emitScaleForReusedOps(
            ReducedSubTree, Builder, SameValuesCounter.front().second);
      VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
      for (Value *RdxVal : VL) {
        Value *OrigV = TrackedToOrig.at(RdxVal);
        if (IsSupportedHorRdxIdentityOp) {
          VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
        if (!V.isVectorized(RdxVal))
          RequiredExtract.insert(RdxVal);
      ReduxWidth = NumReducedVals - Pos;
      if (ReduxWidth > 1)
        ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
      AnyVectorized = true;
    if (OptReusedScalars && !AnyVectorized) {
      for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
        Value *RdxVal = TrackedVals.at(P.first);
        Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
        VectorizedVals.try_emplace(P.first, P.second);
  if (VectorizedTree) {
      if (!AnyBoolLogicOp)
      if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                    getRdxOperand(RedOp1, 0) == LHS ||
      if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                    getRdxOperand(RedOp2, 0) == RHS ||
      if (LHS != VectorizedTree)
      unsigned Sz = InstVals.size();
      for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
        Value *RdxVal1 = InstVals[I].second;
        Value *StableRdxVal1 = RdxVal1;
        auto It1 = TrackedVals.find(RdxVal1);
        if (It1 != TrackedVals.end())
          StableRdxVal1 = It1->second;
        Value *RdxVal2 = InstVals[I + 1].second;
        Value *StableRdxVal2 = RdxVal2;
        auto It2 = TrackedVals.find(RdxVal2);
        if (It2 != TrackedVals.end())
          StableRdxVal2 = It2->second;
        FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
        Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                   StableRdxVal2, "op.rdx", ReductionOps);
        ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
        ExtraReds[Sz / 2] = InstVals.back();
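      // FinalGen above combines the leftover scalars pairwise, so each pass
      // of the surrounding while loop halves the list. A sketch with five
      // leftovers v0..v4: pass 1 builds (v0 op v1) and (v2 op v3), keeping
      // v4; pass 2 builds ((v0 op v1) op (v2 op v3)), keeping v4; pass 3
      // folds v4 in, for a log2-depth chain of "op.rdx" instructions.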
    ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
    for (Value *RdxVal : Candidates) {
      if (!Visited.insert(RdxVal).second)
      unsigned NumOps = VectorizedVals.lookup(RdxVal);
    bool InitStep = true;
    while (ExtraReductions.size() > 1) {
          FinalGen(ExtraReductions, InitStep);
      ExtraReductions.swap(NewReds);
    VectorizedTree = ExtraReductions.front().second;
    ReductionRoot->replaceAllUsesWith(VectorizedTree);
      IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
        for (auto *U : Ignore->users()) {
                 "All users must be either in the reduction ops list.");
        if (!Ignore->use_empty()) {
          Ignore->replaceAllUsesWith(P);
    V.removeInstructionsAndOperands(RdxOps);
  } else if (!CheckForReusedReductionOps) {
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps)
        V.analyzedReductionRoot(cast<Instruction>(RdxOp));
  return VectorizedTree;
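// getReductionCost below prices a candidate as VectorCost - ScalarCost: if,
// say, a <8 x float> fadd reduction costs 5 while the 7 scalar fadds it
// replaces cost 7, the returned delta of -2 makes the reduction profitable
// (numbers illustrative only).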
  Type *ScalarTy = ReducedVals.front()->getType();
  unsigned ReduxWidth = ReducedVals.size();
    int Cnt = ReducedVals.size();
    for (Value *RdxVal : ReducedVals) {
        Cost += GenCostFn();
        auto *RdxOp = cast<Instruction>(U);
        if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
          Cost += ScalarCost;
      Cost += GenCostFn();
  case RecurKind::Add:
  case RecurKind::Mul:
  case RecurKind::Or:
  case RecurKind::And:
  case RecurKind::Xor:
  case RecurKind::FAdd:
  case RecurKind::FMul: {
    if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
      for (unsigned I : seq<unsigned>(ReducedVals.size())) {
      auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
          std::make_pair(RedTy, true));
      if (RType == RedTy) {
    ScalarCost = EvaluateScalarCost([&]() {
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin: {
    ScalarCost = EvaluateScalarCost([&]() {
  LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                    << " (It is a splitting reduction)\n");
  return VectorCost - ScalarCost;
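// An i1 add reduction is just a population count, so instead of widening
// every lane the mask can be reinterpreted as an integer and its set bits
// counted (illustrative IR for 8 lanes; the getIntNTy cast appears below):
//   %int = bitcast <8 x i1> %v to i8
//   %sum = call i8 @llvm.ctpop.i8(i8 %int)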
  assert(VectorizedValue && "Need to have a vectorized tree node");
  assert(RdxKind != RecurKind::FMulAdd &&
         "A call to the llvm.fmuladd intrinsic is not handled yet");
  auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
  if (FTy->getScalarType() == Builder.getInt1Ty() &&
      RdxKind == RecurKind::Add &&
        VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
    ++NumVectorInstructions;
  ++NumVectorInstructions;

  assert(IsSupportedHorRdxIdentityOp &&
         "The optimization of matched scalar identity horizontal reductions "
         "must be supported.");
    return VectorizedValue;
  case RecurKind::Add: {
    Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
               << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateMul(VectorizedValue, Scale);
  case RecurKind::Xor: {
    LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                      << ". (HorRdx)\n");
    return VectorizedValue;
  case RecurKind::FAdd: {
    Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
               << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateFMul(VectorizedValue, Scale);
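  // The scaling rules of this switch, summarized: a scalar repeated Cnt
  // times contributes V * Cnt to an add (emitted as a mul) and to an fadd
  // (fmul); for xor the parity of Cnt decides between V and zero (the
  // even-count branch is elided in this listing); and/or/min/max are
  // idempotent, so the value is returned unchanged.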
  case RecurKind::And:
  case RecurKind::Or:
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin:
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
    return VectorizedValue;
  case RecurKind::Mul:
  case RecurKind::FMul:
  case RecurKind::FMulAdd:
  case RecurKind::IAnyOf:
  case RecurKind::FAnyOf:
  case RecurKind::IFindLastIV:
  case RecurKind::FFindLastIV:
  case RecurKind::None:

  assert(IsSupportedHorRdxIdentityOp &&
         "The optimization of matched scalar identity horizontal reductions "
         "must be supported.");
  auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
  if (VTy->getElementType() != VL.front()->getType()) {
        R.isSignedMinBitwidthRootNode());
  case RecurKind::Add: {
    for (Value *V : VL) {
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
      Vals.push_back(ConstantInt::get(V->getType(), Cnt, false));
               << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateMul(VectorizedValue, Scale);
  case RecurKind::And:
  case RecurKind::Or:
               << ". (HorRdx)\n");
    return VectorizedValue;
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin:
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
               << ". (HorRdx)\n");
    return VectorizedValue;
  case RecurKind::Xor: {
        cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
    std::iota(Mask.begin(), Mask.end(), 0);
    bool NeedShuffle = false;
    for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
      if (Cnt % 2 == 0) {
        NeedShuffle = true;
        dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
          ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
    return VectorizedValue;
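  // Worked example for the vectorized xor case above: with VL = {x, x, y}
  // and counts {x: 2, y: 1} (hypothetical values), the lanes of x are
  // redirected into a zero vector by the shuffle mask, since x ^ x == 0,
  // and only the odd-count lanes keep their original value for the final
  // reduction.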
  case RecurKind::FAdd: {
    for (Value *V : VL) {
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
      Vals.push_back(ConstantFP::get(V->getType(), Cnt));
    return Builder.CreateFMul(VectorizedValue, Scale);
  case RecurKind::Mul:
  case RecurKind::FMul:
  case RecurKind::FMulAdd:
  case RecurKind::IAnyOf:
  case RecurKind::FAnyOf:
  case RecurKind::IFindLastIV:
  case RecurKind::FFindLastIV:
  case RecurKind::None:

  return HorizontalReduction::getRdxKind(V);

  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();
  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0))
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
      return AggregateSize;
  return std::nullopt;
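// A worked example for getAggregateSize above: an insertvalue chain filling
// [2 x [2 x <2 x float>]] multiplies 2 (outer array) * 2 (inner array) * 2
// (vector) = 8 scalar slots; a struct whose elements differ in type returns
// std::nullopt (the example type is illustrative).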
                                   unsigned OperandOffset, const BoUpSLP &R) {
    std::optional<unsigned> OperandIndex =
    if (!OperandIndex || R.isDeleted(LastInsertInst))
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
                              BuildVectorOpds, InsertElts, *OperandIndex, R);
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&

  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
         "Expected empty result vectors!");
  if (!AggregateSize)
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);
  if (BuildVectorOpds.size() >= 2)
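// findBuildAggregate flattens an insertelement/insertvalue chain into one
// operand per slot, e.g. (an illustrative buildvector, not from this file):
//   %v0 = insertelement <4 x float> poison, float %s0, i32 0
//   %v1 = insertelement <4 x float> %v0, float %s1, i32 1
// which yields BuildVectorOpds = {%s0, %s1, ...} plus the matching insert
// per index in InsertElts; fewer than two operands is not worth a bundle,
// hence the size check above.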
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  if (Rdx && DominatedReduxValue(Rdx))
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  if (Rdx && DominatedReduxValue(Rdx))

  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
          isa<IntrinsicInst>(Root)) &&
         "Expected binop, select, or intrinsic for reduction matching");
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
    return dyn_cast<Instruction>(RHS);
    return dyn_cast<Instruction>(LHS);

  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
    return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);

  Value *B0 = nullptr, *B1 = nullptr;

bool SLPVectorizerPass::vectorizeHorReduction(
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
  if (Root->getParent() != BB || isa<PHINode>(Root))
  auto SelectRoot = [&]() {
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
    if (R.isAnalyzedReductionRoot(Inst))
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
  while (!Stack.empty()) {
    std::tie(Inst, Level) = Stack.front();
    if (R.isDeleted(Inst))
    if (Value *VectorizedV = TryToReduce(Inst)) {
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        Stack.emplace(I, Level);
      if (R.isDeleted(Inst))
      if (!TryAppendToPostponedInsts(Inst)) {
      if (VisitedInstrs.insert(Op).second)
        if (auto *I = dyn_cast<Instruction>(Op))
          if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
              !R.isDeleted(I) && I->getParent() == BB)
            Stack.emplace(I, Level);
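  // The loop above performs a worklist traversal seeded with the reduction
  // root (or, for min/max, the select's compare): each instruction is first
  // tried as a horizontal-reduction root, a successfully vectorized result
  // is re-queued in case it feeds another reduction, and otherwise the
  // instruction's not-yet-visited same-block operands are queued, so one
  // seed can discover several reductions feeding it.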
  bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);

  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);

bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
  if (!R.canMapToVector(IVI->getType()))
  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);

      (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);

template <typename T>
                                  bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(*SameTypeIt, *IncIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      if (I && !R.isDeleted(I))
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      VL.swap(Candidates);
      Candidates.clear();
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, false)) {
      } else if (MaxVFOnly) {
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            if (I && !R.isDeleted(I))
          unsigned NumElts = VL.size();
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
      Candidates.clear();
    IncIt = SameTypeIt;
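// tryToVectorizeSequence relies on Comparator having sorted Incoming so that
// AreCompatible values sit next to each other: the loop grabs a maximal run
// of compatible instructions, tries it as one bundle, and accumulates short
// runs of matching type in Candidates for a second, mixed attempt; with
// MaxVFOnly a failed Candidates set is re-split into compatible sub-runs and
// retried, as in the inner loop above.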
template <bool IsCompatibility>
         "Expected valid element types only.");
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
      return !IsCompatibility;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return NodeI2 != nullptr;
        assert((NodeI1 == NodeI2) ==
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
        if (S && (IsCompatibility || !S.isAltShuffle()))
          if (IsCompatibility)
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
  return IsCompatibility;
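// compareCmp serves two roles selected by the template parameter: with
// IsCompatibility == false it is a strict-weak-ordering comparator used for
// sorting ("is V ordered before V2"), with IsCompatibility == true it
// answers "may these two compares share a bundle". That is why every
// mismatch returns !IsCompatibility: "not compatible" in one mode, "ordered
// by this key" in the other.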
template <typename ItT>
  bool Changed = false;
    if (R.isDeleted(I))
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
        if (R.isDeleted(I))
    if (R.isDeleted(I))
    Changed |= tryToVectorize(I, R);
    return compareCmp<false>(V, V2, *TLI, *DT);
  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
    return compareCmp<true>(V1, V2, *TLI, *DT);
  if (Vals.size() <= 1)
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          auto *Select = dyn_cast<SelectInst>(U);
                 Select->getParent() != cast<Instruction>(V)->getParent();
        if (ArePossiblyReducedInOtherBlock)
        return tryToVectorizeList(Candidates, R, MaxVFOnly);

bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  for (auto *I : reverse(Instructions)) {
    if (R.isDeleted(I) || isa<CmpInst>(I))
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
          vectorizeInsertValueInst(LastInsertValue, BB, R, true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
          vectorizeInsertElementInst(LastInsertElem, BB, R, true);
    if (R.isDeleted(I))
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(I))
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
          vectorizeInsertValueInst(LastInsertValue, BB, R, false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  bool Changed = false;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
           "Expected vectorizable types only.");
        V2->getType()->getScalarSizeInBits())
        V2->getType()->getScalarSizeInBits())
    if (Opcodes1.size() < Opcodes2.size())
    if (Opcodes1.size() > Opcodes2.size())
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
      auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
          return NodeI2 != nullptr;
        assert((NodeI1 == NodeI2) ==
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
        if (S && !S.isAltShuffle())
        return I1->getOpcode() < I2->getOpcode();
      bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
      bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      auto ValID1 = Opcodes1[I]->getValueID();
      auto ValID2 = Opcodes2[I]->getValueID();
      if (ValID1 == ValID2)
      if (ValID1 < ValID2)
      if (ValID1 > ValID2)
      assert(U1 && U2 && "The only thing left should be undef & undef.");
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
    if (V1->getType() != V2->getType())
    if (Opcodes1.size() != Opcodes2.size())
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
          if (I1->getParent() != I2->getParent())
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
  bool HaveVectorizedPhiNodes = false;
      auto *P = dyn_cast<PHINode>(&I);
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
      if (!Opcodes.empty())
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
            Nodes.push_back(PHI1);
    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
    Changed |= HaveVectorizedPhiNodes;
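    // The PHI handling works in rounds: PHICompare orders candidates by
    // type size and by the shape of their collected inputs (PHIToOpcodes),
    // and AreCompatiblePHIs admits only structurally matching ones into a
    // bundle, so each tryToVectorizeSequence call sees runs of PHIs that
    // can share a vector; the surrounding do/while re-collects until a
    // round vectorizes nothing new.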
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
      PHIToOpcodes.clear();
  } while (HaveVectorizedPhiNodes);
  VisitedInstrs.clear();
  InstSetVector PostProcessInserts;
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    PostProcessInserts.clear();
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
    if (isa<ScalableVectorType>(It->getType()))
    if (R.isDeleted(&*It))
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(It->isTerminator())) {
    if (isa<DbgInfoIntrinsic>(It))
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      if (P->getNumIncomingValues() == 2) {
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        if (BB == P->getIncomingBlock(I) ||
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          if (Res && R.isDeleted(P)) {
    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
          VectorizeInsertsAndCmps(It->isTerminator());
    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));

  auto Changed = false;
  for (auto &Entry : GEPs) {
    if (Entry.second.size() < 2)
    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");
      return !R.isDeleted(GEP);
    if (It == Entry.second.end())
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
      if (Candidates.size() < 2)
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      Changed |= tryToVectorizeList(Bundle, R);
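// The GEP handling above vectorizes the index computations of otherwise
// unrelated getelementptrs. Candidates whose pointers differ by a constant
// (per SCEV) or that share the very same index value are pruned first,
// since bundling their indices would save nothing; the surviving indices
// are handed to tryToVectorizeList as an ordinary bundle of scalar integer
// instructions.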
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
        return I1->getOpcode() < I2->getOpcode();
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
        isa<UndefValue>(V2->getValueOperand()))
    if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
      if (I1->getParent() != I2->getParent())
        isa<Constant>(V2->getValueOperand()))
           V2->getValueOperand()->getValueID();
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
                      << Pair.second.size() << ".\n");
                                                Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
          return vectorizeStores(Candidates, R, Attempted);
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Correctly creates insert_subvector, checking that the index is multiple of the subvectors length.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
void clearAllBits()
Set every bit to 0.
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
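A short sketch of the ArrayRef slicing accessors above; the data and sizes are illustrative:

#include "llvm/ADT/ArrayRef.h"
using namespace llvm;

int arrayRefDemo() {
  int Data[] = {1, 2, 3, 4, 5};
  ArrayRef<int> A(Data);
  ArrayRef<int> Head = A.take_front(2);  // {1, 2}
  ArrayRef<int> Tail = A.drop_front();   // {2, 3, 4, 5}
  ArrayRef<int> Mid = A.slice(1, 3);     // {2, 3, 4}
  return Head.front() + Tail.back() + (int)Mid.size();  // 1 + 5 + 3
}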
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR changes inside it.
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to calling a function.
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on each one of them as this call.
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
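The predicate helpers above compose as in this sketch; CmpInst also exposes them as static functions taking a Predicate, which is the form assumed here:

#include "llvm/IR/Instructions.h"
using namespace llvm;

void predicateDemo() {
  CmpInst::Predicate P = CmpInst::ICMP_SLT;                   // a < b (signed)
  CmpInst::Predicate Inv = CmpInst::getInversePredicate(P);   // ICMP_SGE: !(a < b)
  CmpInst::Predicate Swap = CmpInst::getSwappedPredicate(P);  // ICMP_SGT: b > a
  (void)Inv; (void)Swap;
}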
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign information.
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate for the type of this constant.
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string, and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a multiple of 8.
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
static bool shouldExecute(unsigned CounterName)
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
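The DenseMap calls above behave as in this sketch; note that lookup returns a default-constructed value for absent keys, while at aborts:

#include "llvm/ADT/DenseMap.h"
using namespace llvm;

void denseMapDemo() {
  DenseMap<int, int> M;
  M.try_emplace(1, 10);                        // inserted
  auto [It, Inserted] = M.try_emplace(1, 99);  // key exists: It->second == 10, Inserted == false
  int V = M.lookup(2);                         // absent key: default-constructed 0
  if (M.contains(1))
    M.erase(1);
  (void)It; (void)Inserted; (void)V;
}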
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
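A hedged sketch of driving IRBuilderBase to emit a shuffle, assuming an already-positioned builder, two <4 x i32> operands, and the ArrayRef<int> mask overload of CreateShuffleVector; the names are illustrative:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

Value *emitBlend(IRBuilderBase &B, Value *V1, Value *V2) {
  int Mask[] = {0, 5, 2, 7};  // lanes 0 and 2 from V1, lanes 1 and 3 from V2
  Value *Blend = B.CreateShuffleVector(V1, V2, Mask, "blend");
  return B.CreateExtractElement(Blend, B.getInt32(2), "lane2");
}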
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos lives in, right after MovePos.
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instruction comes before Other.
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this', or nullptr if no such instruction exists.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
A discriminated union of two or more pointer types, with the discriminator in the low bit of the pointer.
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience functions.
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its value, or in a way that may invalidate previous results.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exactly one source vector.
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e. each index between [0, VF) is used exactly once in each submask of size VF.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor, i.e. <Index, Index+Factor, ..., Index+(Len-1)*Factor>.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
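A small sketch of the static mask queries above, mirroring how the vectorizer classifies masks; the mask values are chosen for illustration:

#include "llvm/IR/Instructions.h"
using namespace llvm;

void maskQueryDemo() {
  int Identity[] = {0, 1, 2, 3};
  bool IsId = ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4);  // true
  int Extract[] = {4, 5};  // contiguous slice of an 8-element source
  int Index = 0;
  bool IsExtract =
      ShuffleVectorInst::isExtractSubvectorMask(Extract, /*NumSrcElts=*/8, Index);
  // Expected under these assumptions: IsExtract == true, Index == 4.
  (void)IsId; (void)IsExtract;
}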
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
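The bitvector API above supports the "mark and verify coverage" idiom the vectorizer uses when grouping shuffles; a sketch with hypothetical inputs:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;

bool allGroupsCovered(unsigned NumGroups, ArrayRef<unsigned> SeenIdx) {
  SmallBitVector Expected(NumGroups, false);
  for (unsigned I : SeenIdx)
    Expected.set(I);      // record each group index that was observed
  return Expected.all();  // true only if every group appeared at least once
}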
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instances.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+ etc).
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
void takeName(Value *V)
Transfer the name from V to this value.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, Instruction *VL0, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state in which we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes.
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedness, if so.
void analyzedReductionRoot(Instruction *I)
Register the given instruction as already analyzed for being a possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars to be extracted, their lanes and their scalar users.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions, marking for deletion trivially dead operands.
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being a possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and extraction in UserIgnoreLst.
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty order, so this is used to decide if a computed order can be canonicalized.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score, deemed to have the best chance to form the root of a profitable tree to vectorize.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibly) a permutation with other gathers.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
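A rough, hedged sketch of the order in which the pass exercises these BoUpSLP entry points; this is not the literal driver code, and the final cost-threshold test is simplified:

// Sketch only: BoUpSLP is file-local to SLPVectorizer.cpp.
void trySLP(BoUpSLP &R, ArrayRef<Value *> Roots,
            const SmallDenseSet<Value *> &UserIgnoreList) {
  R.deleteTree();                      // reset any state from a prior attempt
  R.buildTree(Roots, UserIgnoreList);  // grow the vectorizable tree from Roots
  if (R.isTreeTinyAndNotFullyVectorizable())
    return;                            // too small to be worth the overhead
  R.reorderTopToBottom();              // profitable order, root to leaves
  R.reorderBottomToTop();              // and leaves back to the root
  R.transformNodes();                  // target-specific node rewrites
  R.buildExternalUses();               // record scalars with outside users
  R.computeMinimumValueSizes();        // minimum-bitwidth analysis
  InstructionCost Cost = R.getTreeCost();
  if (Cost.isValid() && Cost < 0)      // stand-in for the real threshold check
    R.vectorizeTree();
}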
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
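The m_* combinators above nest as in this sketch, which recognizes (X + Y) << C where the add has a single use; the pattern itself is just an example:

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

bool isAddThenShl(Value *V, Value *&X, Value *&Y, const APInt *&ShAmt) {
  return match(V, m_Shl(m_OneUse(m_Add(m_Value(X), m_Value(Y))),
                        m_APInt(ShAmt)));
}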
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to calculate the distance between them.
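A hedged use of getPointersDiff for the common consecutive-access test: the distance is in element units, so two loads are adjacent when it equals 1 (the wrapper and its strictness flag are illustrative):

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

bool areConsecutiveLoads(LoadInst *A, LoadInst *B, const DataLayout &DL,
                         ScalarEvolution &SE) {
  std::optional<int> Diff =
      getPointersDiff(A->getType(), A->getPointerOperand(), B->getType(),
                      B->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1;
}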
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item in the sequence, and B, C, ..., are the values from the original input ranges.
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing the effect of MI in a DIExpression.
testing::Matcher< const detail::ErrorHolder & > Failed()
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal.address from the specified value V, returning the original object being addressed.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
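The two mask builders above produce the shapes sketched here; the sizes are illustrative:

#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

void maskBuilderDemo() {
  // Every other lane starting at 0: <0, 2, 4, 6>.
  SmallVector<int, 16> Strided =
      createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  // Each of 2 lanes repeated 3 times: <0, 0, 0, 1, 1, 1>.
  SmallVector<int, 16> Replicated =
      createReplicatedMask(/*ReplicationFactor=*/3, /*VF=*/2);
  (void)Strided; (void)Replicated;
}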
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices, if reordering is required.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that will be converted into a vector (I).
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly.
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors (including the next instruction that follows within a basic block).
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than C.
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx, or on the return type if OpdIdx is -1.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits methods.
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are taken from the machine instruction.
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKind::Vector with EC lanes.
Function object to check whether the first component of a container supported by std::get (like std::pair and std::tuple) compares less than the first component of another container.
Function object to check whether the second component of a container supported by std::get (like std::pair and std::tuple) compares less than the second component of another container.
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.