#ifdef EXPENSIVE_CHECKS

using namespace slpvectorizer;
using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
115 "Controls which SLP graphs should be vectorized.");
119 cl::desc(
"Run the SLP vectorization passes"));
123 cl::desc(
"Enable vectorization for wider vector utilization"));
127 cl::desc(
"Only vectorize if you gain more than this "
132 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
133 "heuristics and makes vectorization decision via cost modeling."));
137 cl::desc(
"Attempt to vectorize horizontal reductions"));
142 "Attempt to vectorize horizontal reductions feeding into a store"));
146 cl::desc(
"Attempt to vectorize for this register size in bits"));
150 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
158 cl::desc(
"Limit the size of the SLP scheduling region per block"));
162 cl::desc(
"Attempt to vectorize for this register size in bits"));
166 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
170 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
176 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
185 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
189 cl::desc(
"The minimum number of loads, which should be considered strided, "
190 "if the stride is > 1 or is runtime value"));
194 cl::desc(
"The maximum stride, considered to be profitable."));
198 cl::desc(
"Display the SLP trees with Graphviz"));
202 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
  if (SLPReVec && isa<FixedVectorType>(Ty))
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&

  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();

  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
                                            Type *Ty, unsigned Sz) {
  if (NumParts == 0 || NumParts >= Sz)
  if (NumParts == 0 || NumParts >= Sz)
  return (Sz / RegVF) * RegVF;

  for (unsigned I : seq<unsigned>(Mask.size()))
                        I * VecTyNumElements, VecTyNumElements)))
                 : Mask[I] * VecTyNumElements + J;
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
      auto *SV = cast<ShuffleVectorInst>(V);
      if (SV->getOperand(0) != Src)
      if (!SV->isExtractSubvectorMask(Index))
      ExpectedIndex.set(Index / ShuffleMaskSize);
  if (!ExpectedIndex.all())
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");

  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
                     : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);

  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
  if (isa<ExtractElementInst>(I))
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");

  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";

  auto *It = find_if(VL, IsaPred<Instruction>);
    if (isa<PoisonValue>(V))
    auto *II = dyn_cast<Instruction>(V);
    if (BB != II->getParent())

  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
    if (!FirstNonUndef) {
    if (V != FirstNonUndef)
  return FirstNonUndef != nullptr;
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
           (BO->getOpcode() == Instruction::FSub &&
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
  return I->isCommutative();
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (CI->getValue().uge(VT->getNumElements()))
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();

  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();

    if (MaskArg == UseMask::UndefsAsMask)
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
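
// isUndefVector: computes, per element, whether \p V is undef (or poison when
// IsPoisonOnly is set), optionally restricted by a use mask; insertelement
// chains are walked down to their base vector.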
template <bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  auto *C = dyn_cast<Constant>(V);
  if (!UseMask.empty()) {
    while (auto *II = dyn_cast<InsertElementInst>(Base)) {
      if (isa<T>(II->getOperand(1)))
      if (*Idx < UseMask.size() && !UseMask.test(*Idx))
    Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
        (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
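
// Checks whether a list of extractelements can be modeled as a single shuffle
// of at most two source vectors, classifying the common mask as a select or a
// permute.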
static std::optional<TargetTransformInfo::ShuffleKind>
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        return std::max(S, VTy->getNumElements());
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
    auto *EE = dyn_cast<ExtractElementInst>(V);
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    if (isa<UndefValue>(VL[I]))
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
    auto *Vec = EI->getVectorOperand();
    if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
    if (isa<UndefValue>(Vec)) {
    if (isa<UndefValue>(EI->getIndexOperand()))
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    unsigned IntIdx = Idx->getValue().getZExtValue();
    if (!Vec1 || Vec1 == Vec) {
    } else if (!Vec2 || Vec2 == Vec) {
    if (CommonShuffleMode == Permute)
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
    CommonShuffleMode = Select;
  if (CommonShuffleMode == Select && Vec2)
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    return CI->getZExtValue();
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
  return *EI->idx_begin();
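
/// Describes a bundle by its main and alternate instructions. When the two
/// differ, the bundle is an "alternate shuffle" (e.g. mixed add/sub) and both
/// opcodes are legal for its lanes.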
class InstructionsState {
    assert(valid() && "InstructionsState is invalid.");
    assert(valid() && "InstructionsState is invalid.");

  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;

  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
      : MainOp(MainOp), AltOp(AltOp) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||

         "Assessing comparisons of different types?");
  return (BasePred == Pred &&
         (BasePred == SwappedPred &&
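
// Computes the InstructionsState for a bundle: scans the scalars, tolerates
// poison lanes, and returns InstructionsState::invalid() whenever the opcodes
// (or cmp predicates, cast kinds, GEPs, loads, calls) cannot be merged under
// a main/alternate opcode pair.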
  if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
    return InstructionsState::invalid();
  auto *It = find_if(VL, IsaPred<Instruction>);
    return InstructionsState::invalid();
  unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
  if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();

  bool IsCastOp = isa<CastInst>(MainOp);
  bool IsBinOp = isa<BinaryOperator>(MainOp);
  bool IsCmpOp = isa<CmpInst>(MainOp);
  unsigned AltOpcode = Opcode;

  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;

  if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
      return InstructionsState::invalid();
  bool AnyPoison = InstCnt != VL.size();
    auto *I = dyn_cast<Instruction>(V);
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        AltOpcode = InstOpcode;
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op1 = I->getOperand(0);
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      if (Opcode == AltOpcode) {
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
    } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(MainOp);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
        auto *AltInst = cast<CmpInst>(AltOp);
        if (MainOp != AltOp) {
        } else if (BasePred != CurrentPred) {
                 "CmpInst isn't safe for alternation, logic needs to be updated!");
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
          return InstructionsState::invalid();
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
          return InstructionsState::invalid();
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(MainOp);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(MainOp);
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState::invalid();
        return InstructionsState::invalid();
        if (Mappings.size() != BaseMappings.size() ||
            Mappings.front().ISA != BaseMappings.front().ISA ||
            Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
            Mappings.front().VectorName != BaseMappings.front().VectorName ||
            Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
            Mappings.front().Shape.Parameters !=
                BaseMappings.front().Shape.Parameters)
          return InstructionsState::invalid();
      return InstructionsState::invalid();
  return InstructionsState(MainOp, AltOp);
  unsigned Opcode = UserInst->getOpcode();
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;

  if (LoadInst *LI = dyn_cast<LoadInst>(I))

  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
    return SI->isSimple();
  return !MI->isVolatile();
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
         (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
         "SubMask with many inputs support must be larger than the mask.");
    Mask.append(SubMask.begin(), SubMask.end());
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
    NewMask[I] = Mask[SubMask[I]];
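
// Order/mask utilities: the first helper fills unused slots of a partial
// order, the opcode-mask helper marks lanes that use the alternate opcode,
// and the last one builds the mask that inverts an index order.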
  const unsigned Sz = Order.size();
  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);
      MaskedIndices.set(I);
  if (MaskedIndices.none())
         "Non-synced masked/available indices.");
    assert(Idx >= 0 && "Indices must be synced.");

  Type *ScalarTy = VL[0]->getType();
  for (unsigned Lane : seq<unsigned>(VL.size())) {
    if (isa<PoisonValue>(VL[Lane]))
    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);

  const unsigned E = Indices.size();
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
  assert(!Mask.empty() && "Expected non-empty mask.");
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Scalars[Mask[I]] = Prev[I];

  auto *I = dyn_cast<Instruction>(V);
    auto *IO = dyn_cast<Instruction>(V);
    return isa<PHINode>(IO) || IO->getParent() != I->getParent();

  auto *I = dyn_cast<Instruction>(V);
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
    auto *IU = dyn_cast<Instruction>(U);
    return IU->getParent() != I->getParent() || isa<PHINode>(IU);

  return !VL.empty() &&

  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
namespace slpvectorizer {

  struct ScheduleData;

      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();

    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;

    const TreeEntry &Root = *VectorizableTree.front().get();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;

    return MinBWs.at(VectorizableTree.front().get()).second;

    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
          VectorizableTree.front()->Scalars.front()->getContext(),
          VectorizableTree.front()->getVectorFactor());
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MultiNodeScalars.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
      return P.value() == P.index() || P.value() == Sz;

    return MaxVecRegSize;

    return MinVecRegSize;

    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
    return MaxVF ? MaxVF : UINT_MAX;

                             unsigned *BestVF = nullptr,
                             bool TryRecursiveCheck = true) const;

  template <typename T>

      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}

      if (isa<LoadInst>(V1)) {
        auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
          auto AllUsersVectorized = [U1, U2, this](Value *V) {
              return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
          return AllUsersVectorized(V1) && AllUsersVectorized(V2);
        if (R.TTI->isLegalBroadcastLoad(V1->getType(),
            ((int)V1->getNumUses() == NumLanes ||
             AllUsersAreInternal(V1, V2)))

      auto CheckSameEntryOrFail = [&]() {
        if (const TreeEntry *TE1 = R.getTreeEntry(V1);
            TE1 && TE1 == R.getTreeEntry(V2))

      auto *LI1 = dyn_cast<LoadInst>(V1);
      auto *LI2 = dyn_cast<LoadInst>(V2);
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
          return CheckSameEntryOrFail();
            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, true);
        if (!Dist || *Dist == 0) {
              R.TTI->isLegalMaskedGather(
          return CheckSameEntryOrFail();
        if (std::abs(*Dist) > NumLanes / 2)

      auto *C1 = dyn_cast<Constant>(V1);
      auto *C2 = dyn_cast<Constant>(V2);

        if (isa<UndefValue>(V2))
        Value *EV2 = nullptr;
          int Dist = Idx2 - Idx1;
          if (std::abs(Dist) == 0)
          if (std::abs(Dist) > NumLanes / 2)
        return CheckSameEntryOrFail();

      auto *I1 = dyn_cast<Instruction>(V1);
      auto *I2 = dyn_cast<Instruction>(V2);
        if (I1->getParent() != I2->getParent())
          return CheckSameEntryOrFail();
            (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
              return isa<PoisonValue>(V) ||
                     cast<Instruction>(V)->getNumOperands() ==
                         S.getMainOp()->getNumOperands();

      if (I1 && isa<PoisonValue>(V2))

      if (isa<UndefValue>(V2))

      return CheckSameEntryOrFail();
      int ShallowScoreAtThisLevel =
      auto *I1 = dyn_cast<Instruction>(LHS);
      auto *I2 = dyn_cast<Instruction>(RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
                ? I2->getNumOperands()
                : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          if (Op2Used.count(OpIdx2))
              I1, I2, CurrLevel + 1, {});
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
      return ShallowScoreAtThisLevel;
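
// VLOperands: a per-lane view of the bundle operands. APO records whether a
// lane sits under an inverse operation (e.g. the RHS of a sub), and IsUsed
// marks operands already consumed by the reordering.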
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      bool IsUsed = false;

    enum class ReorderingMode {

    unsigned ArgSize = 0;

    const Loop *L = nullptr;

    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];

    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];

      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
           OpIdx != NumOperands; ++OpIdx)
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
          OpsVec[OpIdx][Lane].IsUsed = false;

    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
      Value *IdxLaneV = getData(Idx, Lane).V;
      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
          isa<ExtractElementInst>(IdxLaneV))
      for (unsigned Ln : seq<unsigned>(getNumLanes())) {
        Value *OpIdxLnV = getData(OpIdx, Ln).V;
        if (!isa<Instruction>(OpIdxLnV))
      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      auto OpIdxIt = Uniques.find(OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
      return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                          UniquesCntWithOpIdxLaneV,
                      UniquesCntWithOpIdxLaneV -
             ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                  ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                  : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);

    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
      return R.areAllUsersVectorized(IdxLaneI)

    static const int ScoreScaleFactor = 10;
                          int Lane, unsigned OpIdx, unsigned Idx,
        int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
        if (Score <= -SplatScore) {
          Score += SplatScore;
        Score *= ScoreScaleFactor;
        Score += getExternalUseScore(Lane, OpIdx, Idx);
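
// getBestOperand: for a given operand index and lane, picks the candidate
// whose look-ahead score against the last chosen lane is highest, honoring
// the per-operand reordering mode (Load/Opcode/Constant/Splat/Failed).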
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
      unsigned NumOperands = getNumOperands();
      Value *OpLastLane = getData(OpIdx, LastLane).V;
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;
      bool OpIdxAPO = getData(OpIdx, Lane).APO;
        std::optional<unsigned> Idx;
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        OperandData &OpData = getData(Idx, Lane);
        bool OpAPO = OpData.APO;
        if (OpAPO != OpIdxAPO)
        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed, UsedLanes);
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
        case ReorderingMode::Constant:
          if (isa<Constant>(Op) ||
              (!BestOp.Score && L && L->isLoopInvariant(Op))) {
            if (isa<Constant>(Op)) {
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
            if (isa<UndefValue>(Op) || !isa<Constant>(Op))
        case ReorderingMode::Splat:
          if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
            IsUsed = Op == OpLastLane;
            if (Op == OpLastLane) {
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
        case ReorderingMode::Failed:
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
      return std::nullopt;
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto [It, Inserted] =
              HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;

    struct OperandsOrderData {
      unsigned NumOfAPOs = UINT_MAX;
      unsigned NumOpsWithSameOpcodeParent = 0;

    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        const OperandData &OpData = getData(OpIdx, Lane);
        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
              I->getParent() != Parent) {
            if (NumOpsWithSameOpcodeParent == 0) {
              NumOpsWithSameOpcodeParent = 1;
            Parent = I->getParent();
              --NumOpsWithSameOpcodeParent;
            ++NumOpsWithSameOpcodeParent;
            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      assert((empty() || VL.size() == getNumLanes()) &&
             "Expected same number of lanes");
      assert(S.valid() && "InstructionsState is invalid.");
      constexpr unsigned IntrinsicNumOperands = 2;
      ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
      OpsVec.resize(NumOperands);
      unsigned NumLanes = VL.size();
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        OpsVec[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
                 "Expected instruction or poison value");
          if (isa<PoisonValue>(VL[Lane])) {
            if (auto *EI = dyn_cast<ExtractElementInst>(MainOp)) {
                OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false};
            } else if (auto *EV = dyn_cast<ExtractValueInst>(MainOp)) {
                OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false};
            OpsVec[OpIdx][Lane] = {
          bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),

    unsigned getNumOperands() const { return ArgSize; }

    unsigned getNumLanes() const { return OpsVec[0].size(); }

    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;

    bool empty() const { return OpsVec.empty(); }

    void clear() { OpsVec.clear(); }
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(Op);
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
          Value *OpILane = getValue(OpI, Lane);
          bool IsConstantOp = isa<Constant>(OpILane);
              ((Lns > 2 && isa<Constant>(Data.V)) ||
                isa<Constant>(Data.V)))) ||
              (IsInvariant && !isa<Constant>(Data.V) &&
               L->isLoopInvariant(Data.V))) {
            FoundCandidate = true;
        if (!FoundCandidate)
      return getNumLanes() == 2 || Cnt > 1;

    bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      bool OpAPO = getData(OpIdx, Lane).APO;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
              const OperandData &Data = getData(OpI, Ln);
              if (Data.APO != OpAPO || Data.IsUsed)
              Value *OpILn = getValue(OpI, Ln);
              return (L && L->isLoopInvariant(OpILn)) ||
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
          L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
      appendOperandsOfVL(RootVL, S);

      assert(OpsVec[OpIdx].size() == getNumLanes() &&
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;

      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      unsigned FirstLane = getBestLaneToStartReordering();
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);
        if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
              !canBeVectorized(OpILane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else if (isa<LoadInst>(OpILane0))
            ReorderingModes[OpIdx] = ReorderingMode::Load;
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(OpLane0)) {
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        } else if (isa<Argument>(OpLane0)) {
          ReorderingModes[OpIdx] = ReorderingMode::Splat;

      auto &&SkipReordering = [this]() {
        for (const OperandData &Data : Op0)
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
        return UniqueValues.size() != 2 &&
                                   UniqueValues.size());

      if (SkipReordering())

      bool StrategyFailed = false;
      for (unsigned I = 0; I < NumOperands; ++I)
        MainAltOps[I].push_back(getData(I, FirstLane).V);
      UsedLanes.set(FirstLane);
      for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
        int Lane = FirstLane + Direction * Distance;
        if (Lane < 0 || Lane >= (int)NumLanes)
        UsedLanes.set(Lane);
        assert(LastLane >= 0 && LastLane < (int)NumLanes &&
        for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
          std::optional<unsigned> BestIdx =
              getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                             MainAltOps[OpIdx], UsedLanes);
            swap(OpIdx, *BestIdx, Lane);
            StrategyFailed = true;
          if (MainAltOps[OpIdx].size() != 2) {
            OperandData &AltOp = getData(OpIdx, Lane);
            InstructionsState OpS =
            if (OpS && OpS.isAltShuffle())
      if (!StrategyFailed)
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      case ReorderingMode::Load:
      case ReorderingMode::Opcode:
      case ReorderingMode::Constant:
      case ReorderingMode::Splat:
      case ReorderingMode::Failed:

      const unsigned Indent = 2;
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          if (Value *V = OpData.V)
          OS << ", APO:" << OpData.APO << "}\n";

    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
                                               Candidates[I].second,
      if (Score > BestScore) {

    DeletedInstructions.insert(I);
  template <typename T>
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      DeletedInstructions.insert(I);
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
      auto *I = cast<Instruction>(V);
      if (const TreeEntry *Entry = getTreeEntry(I)) {
        Entries.push_back(Entry);
        auto It = MultiNodeScalars.find(I);
        if (It != MultiNodeScalars.end())
          Entries.append(It->second.begin(), It->second.end());
      for (Use &U : I->operands()) {
        if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
      I->dropAllReferences();
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      if (!I->getParent())
               cast<Instruction>(U.getUser()));
             "trying to erase instruction with users.");
      I->removeFromParent();
    while (!DeadInsts.empty()) {
      if (!VI || !VI->getParent())
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (auto *OpI = dyn_cast<Instruction>(OpV))
          if (!DeletedInstructions.contains(OpI) &&
      VI->removeFromParent();
      DeletedInstructions.insert(VI);

    return AnalyzedReductionsRoots.count(I);

    AnalyzedReductionsRoots.insert(I);

    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();

    return NonScheduledFirst.contains(V);
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      bool &IsProfitableToDemote, bool IsTruncRoot) const;

  canReorderOperands(TreeEntry *UserTE,

  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    TreeEntry *TE = nullptr;
      TE = getTreeEntry(V);
      if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
      auto It = MultiNodeScalars.find(V);
      if (It != MultiNodeScalars.end()) {
        for (TreeEntry *E : It->second) {
          if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
    if (It != VL.end()) {
      assert(TE->isSame(VL) && "Expected same scalars.");

  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);

  bool areAllUsersVectorized(

  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;

  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;

  getCastContextHint(const TreeEntry &TE) const;

                       const EdgeInfo &EI, unsigned InterleaveFactor = 0);

                   bool ResizeAllowed = false) const;

  TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
  const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
                                               unsigned NodeIdx) const {
    return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);

  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);

  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);

  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy,
                           bool PostponedPHIs);

  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  std::optional<TargetTransformInfo::ShuffleKind>
                              unsigned NumParts) const;

  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(

  isGatherShuffledEntry(
      unsigned NumParts, bool ForOrder = false);

               Type *ScalarTy) const;

  void setInsertPointAfterBundle(const TreeEntry *E);

  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  void tryToVectorizeGatheredLoads(

  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;

  void reorderGatherNode(TreeEntry &TE);
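
// TreeEntry: one node of the vectorizable tree. Scalars holds the bundled
// values; ReuseShuffleIndices/ReorderIndices describe how lanes are shuffled;
// State distinguishes vectorized, strided, scattered and gathered nodes.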
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

                    [Scalars](Value *V, int Idx) {
                      return (isa<UndefValue>(V) && Idx == PoisonMaskElem) ||
                             (Idx != PoisonMaskElem && V == Scalars[Idx]);
      if (!ReorderIndices.empty()) {
        return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          return IsSame(Scalars, Mask);
      return IsSame(Scalars, ReuseShuffleIndices);

    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return isGather() && !UserTreeIndices.empty() &&
             UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;

    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
        if (PrevCount == Used.count())

    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();

    bool isGather() const { return State == NeedToGather; }
    enum CombinedOpcode {
      MinMax = Instruction::OtherOpsEnd + 1,
    CombinedOpcode CombinedOp = NotCombinedOp;

    VecTreeTy &Container;

    InstructionsState S = InstructionsState::invalid();

    unsigned InterleaveFactor = 0;

    unsigned getInterleaveFactor() const { return InterleaveFactor; }

    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }

      assert(Operands[OpIdx].empty() && "Already resized?");
             "Number of operands is greater than the number of scalars.");

    void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
      VLOperands Ops(Scalars, S, R);
      for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands()))
        setOperand(I, Ops.getVL(I));

    unsigned getNumOperands() const { return Operands.size(); }

    Value *getSingleOperand(unsigned OpIdx) const {
      assert(!Operands[OpIdx].empty() && "No operand available");

    bool isAltShuffle() const { return S.isAltShuffle(); }

    bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }

      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))
      return S.getMainOp();

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");

    Instruction *getMainOp() const { return S.getMainOp(); }

    Instruction *getAltOp() const { return S.getAltOp(); }

    unsigned getOpcode() const { return S.getOpcode(); }

    unsigned getAltOpcode() const { return S.getAltOpcode(); }

    bool hasState() const { return S.valid(); }
    int findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");

    bool isNonPowOf2Vec() const {
      return IsNonPowerOf2;

      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;

    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
          dbgs() << "Vectorize\n";
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        dbgs() << "NeedToGather\n";
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
          dbgs() << "Entry index " << P.first << " with offset " << P.second;

  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
    dbgs() << "SLP: " << Banner << ":\n";
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);

                          TreeEntry::EntryState EntryState,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
           "Need to vectorize gather entry?");
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->setOperations(S);
      Last->Scalars.assign(VL.size(), nullptr);
        if (Idx >= VL.size())
          return UndefValue::get(VL.front()->getType());
      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    if (!Last->isGather()) {
      for (Value *V : VL) {
        const TreeEntry *TE = getTreeEntry(V);
               "Scalar already in tree!");
          MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
        ScalarToTreeEntry[V] = Last;
      ScheduleData *BundleMember = *Bundle;
      assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
             "Bundle and VL out of sync");
      for (Value *V : VL) {
        BundleMember->TE = Last;
        BundleMember = BundleMember->NextInBundle;
      assert(!BundleMember && "Bundle and VL out of sync");
      bool AllConstsOrCasts = true;
        auto *I = dyn_cast<CastInst>(V);
        AllConstsOrCasts &= I && I->getType()->isIntegerTy();
        if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
            !UserTreeIdx.UserTE->isGather())
      if (AllConstsOrCasts)
          std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert(VL.begin(), VL.end());
    if (UserTreeIdx.UserTE)
      Last->UserTreeIndices.push_back(UserTreeIdx);
  TreeEntry::VecTreeTy VectorizableTree;

    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();

  TreeEntry *getTreeEntry(Value *V) {
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntry.lookup(V);

  const TreeEntry *getTreeEntry(Value *V) const {
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntry.lookup(V);

  bool areAltOperandsProfitable(const InstructionsState &S,

  TreeEntry::EntryState
      bool IsScatterVectorizeUserTE,

  using ValueToGatherNodesMap =
  ValueToGatherNodesMap ValueToGatherNodes;

  bool IsGraphTransformMode = false;

  std::optional<unsigned> GatheredLoadsEntriesFirst;

  struct ExternalUser {

    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto It = AliasCache.find(Key);
    if (It != AliasCache.end())
    AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  UserList ExternalUses;
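
// ScheduleData models one instruction (or bundle member) inside the
// scheduling region: intrusive bundle links, dependency counts, and the
// region ID that marks it as live in the current window.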
  struct ScheduleData {
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();

      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
        assert(UnscheduledDeps == Dependencies && "invariant");
        assert(isSchedulingEntity() && "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");
      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");

    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    bool isSchedulingEntity() const { return FirstInBundle == this; }

    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;

    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;

    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();

    void resetUnscheduledDeps() {
      UnscheduledDeps = Dependencies;

    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();

    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
        Sum += BundleMember->UnscheduledDeps;

      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        ScheduleData *SD = NextInBundle;
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;

    TreeEntry *TE = nullptr;

    ScheduleData *FirstInBundle = nullptr;

    ScheduleData *NextInBundle = nullptr;

    ScheduleData *NextLoadStore = nullptr;

    int SchedulingRegionID = 0;

    int SchedulingPriority = 0;

    int Dependencies = InvalidDeps;

    int UnscheduledDeps = InvalidDeps;

    bool IsScheduled = false;

                                       const BoUpSLP::ScheduleData &SD) {
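
// BlockScheduling keeps the per-basic-block scheduling region. schedule()
// retires a bundle and decrements the unscheduled-dependency counters of its
// def, memory and control dependants, moving newly ready bundles onto the
// ready list.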
  struct BlockScheduling {
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      ScheduleRegionSize = 0;
      ++SchedulingRegionID;

      if (BB != I->getParent())
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;

    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          ScheduleData *OpDef = getScheduleData(I);
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
                       << "SLP: gets ready (def): " << *DepBundle << "\n");
        if (TreeEntry *TE = BundleMember->TE) {
          auto *In = BundleMember->Inst;
          int Lane = std::distance(TE->Scalars.begin(),
          assert(Lane >= 0 && "Lane not set");
                 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
                  In->getNumOperands() == TE->getNumOperands()) &&
                 "Missed TreeEntry operands?");
          for (unsigned OpIdx : seq<unsigned>(TE->getNumOperands()))
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
                       << "SLP: gets ready (mem): " << *DepBundle << "\n");
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
                       << "SLP: gets ready (ctl): " << *DepBundle << "\n");

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");

    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
          ReadyList.insert(SD);
                     << "SLP: initially in ready list: " << *SD << "\n");

    std::optional<ScheduleData *>
                      const InstructionsState &S);

    ScheduleData *allocateScheduleDataChunks();

    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

                            ScheduleData *PrevLoadStore,
                            ScheduleData *NextLoadStore);

    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,

    void resetSchedule();

    ScheduleData *FirstLoadStoreInRegion = nullptr;

    ScheduleData *LastLoadStoreInRegion = nullptr;

    bool RegionHasStackSave = false;

    int ScheduleRegionSize = 0;

    int SchedulingRegionID = 1;

  void scheduleBlock(BlockScheduling *BS);
  struct OrdersTypeDenseMapInfo {
    static unsigned getHashValue(const OrdersType &V) {

  unsigned MaxVecRegSize;
  unsigned MinVecRegSize;

  unsigned ReductionBitWidth = 0;

  unsigned BaseGraphSize = 1;

  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;

  struct ChildIteratorType
          ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {

    return R.VectorizableTree[0].get();

    return {N->UserTreeIndices.begin(), N->Container};

    return {N->UserTreeIndices.end(), N->Container};

  class nodes_iterator {
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }

    return nodes_iterator(R->VectorizableTree.begin());

    return nodes_iterator(R->VectorizableTree.end());

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }

    OS << Entry->Idx << ".\n";
    for (auto *V : Entry->Scalars) {
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;

    if (Entry->isGather())
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize)
      return "color=blue";
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      if (isa<PHINode>(I))
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
      continue;
    }
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }
#ifdef EXPENSIVE_CHECKS
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Reuses[Mask[I]] = Prev[I];
}
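// Worked example (added commentary, not from the upstream source): with
// Reuses = {a, b, c, d} and Mask = {2, 0, 3, 1}, each Prev[I] moves to slot
// Mask[I], producing Reuses = {b, d, a, c}.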
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  // ...
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
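// Worked example (added commentary): starting from an empty (identity) Order
// and Mask = {3, 2, 1, 0}, MaskOrder becomes {3, 2, 1, 0} after
// reorderReuses, and the final assignment Order[MaskOrder[I]] = I yields
// Order = {3, 2, 1, 0}, i.e. the inverse permutation of the mask.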
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
  Type *ScalarTy = GatheredScalars.front()->getType();
  int NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  auto *VecTy = getWidenedType(ScalarTy, NumScalars);
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= NumScalars ||
      VecTy->getNumElements() % NumParts != 0 ||
      !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
                                VecTy->getNumElements() / NumParts))
    NumParts = 1;
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
      ExtractShuffles =
          tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
      GatherShuffles =
          isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                                /*ForOrder=*/true);
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
      Entries.front().front()->isSame(TE.Scalars)) {
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    int SingleElt = PoisonMaskElem;
    return all_of(Mask, [&](int I) {
      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
        SingleElt = I;
      return I == PoisonMaskElem || I == SingleElt;
    });
  };
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  SmallBitVector ShuffledSubMasks(NumParts);
  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                  ArrayRef<int> Mask, int PartSz, int NumParts,
                                  function_ref<unsigned(unsigned)> GetVF) {
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
        continue;
      const int VF = GetVF(I);
      int Limit = getNumElems(CurrentOrder.size(), PartSz, I);
      MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
      if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      int FirstMin = INT_MAX;
      bool SecondVecFound = false;
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem) {
          Value *V = GatheredScalars[I * PartSz + K];
          if (isConstant(V) && !isa<PoisonValue>(V)) {
            SecondVecFound = true;
            break;
          }
          continue;
        }
        if (Idx < VF) {
          if (FirstMin > Idx)
            FirstMin = Idx;
        } else {
          SecondVecFound = true;
          break;
        }
      }
      FirstMin = (FirstMin / PartSz) * PartSz;
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem)
          continue;
        Idx -= FirstMin;
        if (Idx >= PartSz) {
          SecondVecFound = true;
          break;
        }
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      }
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
    }
  };
  int PartSz = getPartNumElems(NumScalars, NumParts);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          unsigned VF = 0;
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            if (ExtractMask[K] == PoisonMaskElem)
              continue;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts,
                         [&](unsigned I) {
                           if (!GatherShuffles[I])
                             return 0U;
                           return std::max(
                               Entries[I].front()->getVectorFactor(),
                               Entries[I].back()->getVectorFactor());
                         });
  int NumUndefs =
      count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
  if (ShuffledSubMasks.all() ||
      (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
}
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : drop_begin(VL))
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}
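// Usage note (added commentary): the minimum alignment over the bundle is
// the only alignment that is safe for the whole vector access; e.g. loads
// aligned to 16, 8 and 4 bytes yield a common alignment of Align(4).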
4805 "Order is empty. Please check it before using isReverseOrder.");
4806 unsigned Sz = Order.
size();
4808 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
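// Worked example (added commentary): for Sz == 4, {3, 2, 1, 0} is a reverse
// order, and so is {3, 4, 1, 4}, because entries equal to Sz act as
// unspecified slots that match any position.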
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                  const DataLayout &DL, ScalarEvolution &SE,
                  SmallVectorImpl<unsigned> &SortedIndices,
                  Instruction *Inst = nullptr) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return std::nullopt;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
    if (Diff->isNonConstantNegative()) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
    if (Diff1->isNonConstantNegative()) {
      PtrSCEVHighest = PtrSCEV;
      continue;
    }
  }
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return nullptr;
  };
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz =
        SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return std::nullopt;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  // ...
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    unsigned Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
      if (!SE.getMinusSCEV(PtrSCEV,
                           SE.getAddExpr(PtrSCEVLowest,
                                         SE.getMulExpr(Stride, SC)))
               ->isZero())
        return std::nullopt;
      Dist = SC->getAPInt().getZExtValue();
    }
    if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return std::nullopt;
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    SortedIndices.resize(PointerOps.size(), 0);
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
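// Note (added commentary): for pointers p, p + s, p + 2*s, ... whose stride
// s is only known at run time, the distance between the lowest and highest
// pointer is Size * (N - 1) * s; TryGetStride peels that known constant
// multiplier off the SCEV product to recover s, and the per-pointer
// coefficients must then form a permutation of 0 .. N-1 (collected in the
// Offsets set) for the accesses to be a single strided load.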
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy);

  // ...
  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
  int NumSubElts;
  if (ShuffleVectorInst::isInsertSubvectorMask(Mask, NumSrcElts, NumSubElts,
                                               Index)) {
    if (Index + NumSubElts > NumSrcElts &&
        Index + NumSrcElts <= static_cast<int>(Mask.size()))
      // ...
  }

  // ...
  if (Index % SubVecVF == 0) {
    // ...
  } else {
    // Create a shuffle; insertvector requires that the index is a multiple
    // of the subvector length.
    SmallVector<int> Mask(VecVF, PoisonMaskElem);
    std::iota(Mask.begin(), Mask.end(), 0);
    for (unsigned I : seq<unsigned>(SubVecVF))
      Mask[I + Index] = I + VecVF;
    if (Generator) {
      Vec = Generator(Vec, V, Mask);
    } else {
      SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
      std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF),
                0);
      // ...
    }
  }

static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
                                  unsigned SubVecVF, unsigned Index) {
  if (Index % SubVecVF == 0) {
    // ...
  }
  SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
  std::iota(Mask.begin(), Mask.end(), Index);
  return Builder.CreateShuffleVector(Vec, Mask);
}
BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                           SmallVectorImpl<unsigned> &Order,
                           SmallVectorImpl<Value *> &PointerOps,
                           unsigned *BestVF, bool TryRecursiveCheck) const {
  // ...
  Type *ScalarTy = VL0->getType();
  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  const unsigned Sz = VL.size();
  PointerOps.clear();
  PointerOps.resize(Sz);
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }

  auto *VecTy = getWidenedType(ScalarTy, Sz);
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  // ...
  Value *Ptr0;
  Value *PtrN;
  if (Order.empty()) {
    Ptr0 = PointerOps.front();
    PtrN = PointerOps.back();
  } else {
    Ptr0 = PointerOps[Order.front()];
    PtrN = PointerOps[Order.back()];
  }
  std::optional<int> Diff =
      getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
  // Check that the sorted loads are consecutive.
  if (static_cast<unsigned>(*Diff) == Sz - 1)
    return LoadsState::Vectorize;

  bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
  // Try to generate a strided load node if the pointers form an equal-step
  // sequence and the target supports strided accesses.
  auto IsAnyPointerUsedOutGraph =
      IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
        return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
                 return !getTreeEntry(U) && !MustGather.contains(U);
               });
      });
  const unsigned AbsoluteDiff = std::abs(*Diff);
  if (IsPossibleStrided &&
      (IsAnyPointerUsedOutGraph ||
       (AbsoluteDiff > Sz &&
        AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)) ||
       *Diff == -(static_cast<int>(Sz) - 1))) {
    int Stride = *Diff / static_cast<int>(Sz - 1);
    if (*Diff == Stride * static_cast<int>(Sz - 1)) {
      // ...
      SmallSet<int, 4> Dists;
      for (Value *Ptr : PointerOps) {
        int Dist = 0;
        if (Ptr == PtrN)
          Dist = *Diff;
        else if (Ptr != Ptr0)
          Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
        // If the strides are not the same or repeated, we can't vectorize.
        if (((Dist / Stride) * Stride) != Dist ||
            !Dists.insert(Dist).second)
          break;
      }
      if (Dists.size() == Sz)
        return LoadsState::StridedVectorize;
    }
  }

  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
                                                unsigned *BestVF,
                                                bool ProfitableGatherPointers) {
    // ...
    auto [ScalarGEPCost, VectorGEPCost] =
        getGEPCosts(TTI, PointerOps, PointerOps.front(),
                    Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
    auto *PtrVecTy = getWidenedType(PointerOps.front()->getType(),
                                    VecTy->getNumElements());
    if (static_cast<unsigned>(count_if(
            PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1)
      VectorGEPCost += TTI.getScalarizationOverhead(
          PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false,
          CostKind);
    // ...
    InstructionCost MaskedGatherCost =
        TTI.getGatherScatterOpCost(Instruction::Load, VecTy,
                                   cast<LoadInst>(VL0)->getPointerOperand(),
                                   /*VariableMask=*/false, CommonAlignment,
                                   CostKind) +
        (ProfitableGatherPointers ? 0 : VectorGEPCost);
    // ...
    constexpr unsigned ListLimit = 4;
    if (!TryRecursiveCheck || VL.size() < ListLimit)
      return false;

    unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
    // Split VL into chunks of VF elements and mark the chunks that would
    // still be gathered.
    for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
      // ...
      DemandedElts.setBits(Cnt, Cnt + VF);
    }
    if (!DemandedElts.isZero()) {
      // ...
      for (unsigned Idx : seq<unsigned>(VL.size()))
        if (DemandedElts[Idx])
          // ...
    }
    for (unsigned I = 0, E = VL.size() / VF; I < E; ++I) {
      auto *LI0 = cast<LoadInst>(VL[I * VF]);
      auto [ScalarGEPCost, VectorGEPCost] =
          getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
                      LI0->getPointerOperand(), Instruction::GetElementPtr,
                      CostKind, ScalarTy, SubVecTy);
      if (static_cast<unsigned>(
              count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
              PointerOps.size() - 1)
        // ...
      // Compare the cost of a masked gather, a strided load and a scattered
      // load for this chunk, all based on LI0->getPointerOperand() and
      // LI0->getPointerAddressSpace().
      // ...
    }
    for (int Idx : seq<int>(0, VL.size()))
      // ...
    if (MaskedGatherCost >= VecLdCost)
      return true;
    return false;
  };

  // ...
  bool ProfitableGatherPointers =
      L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
                       return L->isLoopInvariant(V);
                     })) <= Sz / 2;
  if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
        return !GEP ||
               (GEP && GEP->getNumOperands() == 2 &&
                isa<Constant, Instruction>(GEP->getOperand(1)));
      })) {
    // ...
    if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
                                                     ProfitableGatherPointers))
      return LoadsState::ScatterVectorize;
  }
  return LoadsState::Gather;
}
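// Note (added commentary): the decision ladder above is, roughly:
// consecutive pointers -> LoadsState::Vectorize; equal-step pointers with a
// legal target strided access -> LoadsState::StridedVectorize; otherwise a
// cost comparison between masked-gather, plain gather and split-vector loads
// decides between LoadsState::ScatterVectorize and LoadsState::Gather.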
5361 "Expected list of pointer operands.");
5371 .first->second.emplace_back().emplace_back(VL.
front(), 0U, 0U);
5373 SortedIndices.
clear();
5375 auto Key = std::make_pair(BBs[Cnt + 1],
5379 std::optional<int> Diff = getPointersDiff(
5380 ElemTy, std::get<0>(Base.front()), ElemTy,
5386 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5392 if (Bases.
size() > VL.
size() / 2 - 1)
5396 Bases.
find(Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
5403 if (Bases.
size() == 1 && (Bases.
front().second.size() == 1 ||
5404 Bases.
front().second.size() == VL.
size()))
5409 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
5418 FirstPointers.
insert(P1);
5419 SecondPointers.
insert(P2);
5425 "Unable to find matching root.");
5428 for (
auto &
Base : Bases) {
5429 for (
auto &Vec :
Base.second) {
5430 if (Vec.size() > 1) {
5431 stable_sort(Vec, [](
const std::tuple<Value *, int, unsigned> &
X,
5432 const std::tuple<Value *, int, unsigned> &
Y) {
5433 return std::get<1>(
X) < std::get<1>(
Y);
5435 int InitialOffset = std::get<1>(Vec[0]);
5436 bool AnyConsecutive =
5438 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
5442 if (!AnyConsecutive)
5447 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5451 for (
auto &
T : Bases)
5452 for (
const auto &Vec :
T.second)
5453 for (
const auto &
P : Vec)
5457 "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();

  SmallVector<Value *> Ptrs;
  Ptrs.reserve(TE.Scalars.size());
  SmallVector<BasicBlock *> BBs;
  BBs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    Ptrs.push_back(L->getPointerOperand());
    BBs.push_back(L->getParent());
  }

  BoUpSLP::OrdersType Order;
  if (!LoadEntriesToVectorize.contains(TE.Idx) &&
      clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
    return std::move(Order);
  return std::nullopt;
}
  if (VU->getType() != V->getType())
    return false;
  // Multiple used inserts are separate nodes.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  // ...
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  // Go through the vector operand of insertelement instructions trying to
  // find either VU as the original vector for IE2 or V as the original
  // vector for IE1.
  SmallBitVector ReusedIdx(
      cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
  bool IsReusedIdx = false;
  do {
    if (IE2 == VU && !IE1)
      return VU->hasOneUse();
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
      IsReusedIdx |= ReusedIdx.test(Idx1);
      ReusedIdx.set(Idx1);
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        IE1 = nullptr;
      else
        IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    }
    if (IE2 && IE2 != VU) {
      unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
      IsReusedIdx |= ReusedIdx.test(Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        IE2 = nullptr;
      else
        IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
    }
  } while (!IsReusedIdx && (IE1 || IE2));
  return false;
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
  // No need to reorder if need to shuffle reuses, still need to shuffle the
  // node.
  if (!TE.ReuseShuffleIndices.empty()) {
    // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
    assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
           "Reshuffling scalars not yet supported for nodes with padding");

    if (isSplat(TE.Scalars))
      return std::nullopt;
    // Check if the reuse shuffle indices can be improved by reordering.
    unsigned Sz = TE.Scalars.size();
    if (TE.isGather()) {
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE)) {
        SmallVector<int> Mask;
        fixupOrderingIndices(*CurrentOrder);
        inversePermutation(*CurrentOrder, Mask);
        ::addMask(Mask, TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
            if (Idx != PoisonMaskElem)
              Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
                                             2 * TE.getVectorFactor())) == 1)
      return std::nullopt;
    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                     Sz)) {
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      ::addMask(ReorderMask, TE.ReuseShuffleIndices);
      unsigned VF = ReorderMask.size();
      OrdersType ResOrder(VF, VF);
      unsigned NumParts = divideCeil(VF, Sz);
      SmallBitVector UsedVals(NumParts);
      for (unsigned I = 0; I < VF; I += Sz) {
        int Val = PoisonMaskElem;
        unsigned UndefCnt = 0;
        unsigned Limit = std::min(Sz, VF - I);
        if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
                   [&](int Idx) {
                     if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
                       Val = Idx;
                     if (Idx == PoisonMaskElem)
                       ++UndefCnt;
                     return Idx != PoisonMaskElem && Idx != Val;
                   }) ||
            Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
            UndefCnt > Sz / 2)
          return std::nullopt;
        UsedVals.set(Val);
        for (unsigned K = 0; K < NumParts; ++K) {
          unsigned Idx = Val + Sz * K;
          if (Idx < VF)
            ResOrder[Idx] = I + K;
        }
      }
      return std::move(ResOrder);
    }
    unsigned VF = TE.getVectorFactor();
    // Try to build the correct order for extractelement instructions.
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
        all_of(TE.Scalars, [Sz](Value *V) {
          if (isa<PoisonValue>(V))
            return true;
          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
          return Idx && *Idx < Sz;
        })) {
      assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
                                   "by BinaryOperator and CastInst.");
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == PoisonMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
        Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
      }
    }
    // Build the order of VF size; reuses shuffles are always of VF size.
    OrdersType ResOrder(VF);
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
      if (SubMask.front() == PoisonMaskElem)
        std::iota(SubMask.begin(), SubMask.end(), 0);
      reorderOrder(CurrentOrder, SubMask);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
    }
    if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
          return Data.index() == Data.value();
        }))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      any_of(TE.UserTreeIndices,
             [](const EdgeInfo &EI) {
               return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
             }) &&
      (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
    return std::nullopt;
  if ((TE.State == TreeEntry::Vectorize ||
       TE.State == TreeEntry::StridedVectorize) &&
      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
    assert(!TE.isAltShuffle() && "Alternate instructions are only supported by "
                                 "BinaryOperator and CastInst.");
    return TE.ReorderIndices;
  }
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;

    SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
    for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
      if (!V->hasNUsesOrMore(1))
        continue;
      auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
      if (!II)
        continue;
      Instruction *BVHead = nullptr;
      BasicBlock *BB = II->getParent();
      while (II && II->hasOneUse() && II->getParent() == BB) {
        BVHead = II;
        II = dyn_cast<InsertElementInst>(II->getOperand(0));
      }
      I = BVHead;
    }

    auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
      assert(BB1 != BB2 && "Expected different basic blocks.");
      auto *NodeA = DT->getNode(BB1);
      auto *NodeB = DT->getNode(BB2);
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
    };
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
        return false;
      if (isa<PoisonValue>(V1))
        return true;
      if (isa<PoisonValue>(V2))
        return false;
      // ...
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
        return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
                                    FirstUserOfPhi2->getParent());
      auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
      auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
      auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
      auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
      if (IE1 && IE2) {
        if (UserBVHead[I1] && !UserBVHead[I2])
          return true;
        if (!UserBVHead[I1])
          return false;
        if (UserBVHead[I1] == UserBVHead[I2])
          return getElementIndex(IE1) < getElementIndex(IE2);
        if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
          return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
                                      UserBVHead[I2]->getParent());
        return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
      }
      if (EE1 && EE2) {
        auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
        auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
        auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
        auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
        if (EE1->getOperand(0) == EE2->getOperand(0))
          return getElementIndex(EE1) < getElementIndex(EE2);
        if (!Inst1 && Inst2)
          return false;
        if (Inst1 && Inst2) {
          if (Inst1->getParent() != Inst2->getParent())
            return CompareByBasicBlocks(Inst1->getParent(),
                                        Inst2->getParent());
          return Inst1->comesBefore(Inst2);
        }
        assert(P1 && P2 &&
               "Expected either instructions or arguments vector operands.");
        return P1->getArgNo() < P2->getArgNo();
      }
      return false;
    };
    OrdersType Phis(TE.Scalars.size());
    std::iota(Phis.begin(), Phis.end(), 0);
    stable_sort(Phis, PHICompare);
    if (isIdentityOrder(Phis))
      return std::nullopt;
    return std::move(Phis);
  }
  if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&
      allSameType(TE.Scalars)) {
    if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
        all_of(TE.Scalars, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        })) {
      // Check that the gather of extractelements can be represented as just a
      // shuffle of a single vector.
      OrdersType CurrentOrder;
      bool Reuse =
          canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    int Sz = TE.Scalars.size();
    if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
        count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
      const auto *It =
          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
      if (It == TE.Scalars.begin())
        return OrdersType();
      if (It != TE.Scalars.end()) {
        OrdersType Order(Sz, Sz);
        unsigned Idx = std::distance(TE.Scalars.begin(), It);
        // Compare the cost of inserting into lane 0 plus a permute with the
        // cost of inserting directly into lane Idx.
        // ...
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          Order[Idx] = 0;
          return std::move(Order);
        }
      }
    }
    if (isSplat(TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
      // ...
      BoUpSLP::LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                                  CurrentOrder, PointerOps);
      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
        return std::move(CurrentOrder);
    }
  }
  if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
    return CurrentOrder;
  return std::nullopt;
}
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                               unsigned Sz) {
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
    return false;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef<int> Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
  return true;
}
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder the reuses mask.
  reorderReuses(TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized and non-clustered reused scalars - just reorder.
  if (!TE.isGather() ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                   Sz) ||
      !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz)) {
    reorderScalars(TE.Scalars, Mask);
  } else {
    SmallVector<int> NewMask;
    inversePermutation(TE.ReorderIndices, NewMask);
    addMask(NewMask, TE.ReuseShuffleIndices);
    reorderScalars(TE.Scalars, NewMask);
  }
  TE.ReorderIndices.clear();
  // Fill the reuses mask with identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}
5905 "Expected same size of orders");
5906 unsigned Sz = Order.
size();
5908 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
5909 if (Order[
Idx] != Sz)
5910 UsedIndices.
set(Order[
Idx]);
5912 if (SecondaryOrder.
empty()) {
5913 for (
unsigned Idx : seq<unsigned>(0, Sz))
5914 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
5917 for (
unsigned Idx : seq<unsigned>(0, Sz))
5918 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
5919 !UsedIndices.
test(SecondaryOrder[
Idx]))
5920 Order[
Idx] = SecondaryOrder[
Idx];
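// Worked example (added commentary): with Sz == 4, Order = {1, 4, 4, 0} and
// SecondaryOrder = {1, 2, 3, 0}, slots 1 and 2 are unset in Order while
// indices 2 and 3 are still unused, so they are filled from the secondary
// order, giving {1, 2, 3, 0}.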
  DenseMap<const TreeEntry *, SmallVector<OrdersType>> ExternalUserReorderMap;

  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector<OrdersType> ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      ExternalUserReorderMap.try_emplace(
          TE.get(), std::move(ExternalUserReorderIndices));
    }

    if (TE->hasState() && TE->isAltShuffle()) {
      // ...
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      // ...
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
        // ...
      }
    }

    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/true)) {
      // Do not include ordering for nodes used in the alt opcode vectorization.
      const TreeEntry *UserTE = TE.get();
      while (UserTE) {
        if (UserTE->UserTreeIndices.size() != 1)
          break;
        if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
              return EI.UserTE->State == TreeEntry::Vectorize &&
                     EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
            }))
          return;
        UserTE = UserTE->UserTreeIndices.back().UserTE;
      }
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });

  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // ...
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder this node; it has a reuse mask but no known order.
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->hasState() && OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external uses.
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
        }
      }
      // Stores actually store the mask, not the order, need to invert.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Order, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(CurrentOrder);
        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
      } else {
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
      }
    }
    OrdersType BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // Prefer identity order, but if a filled identity was found with the
      // same number of uses as the candidate, take the candidate.
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           isIdentityOrder(BestOrder))) {
        BestOrder = Pair.first;
        Cnt = Pair.second;
      }
    }
    if (isIdentityOrder(BestOrder))
      continue;
    fixupOrderingIndices(BestOrder);
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    unsigned E = BestOrder.size();
    SmallVector<int> MaskOrder(E, PoisonMaskElem);
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // ...
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          assert(all_of(TE->UserTreeIndices,
                        [VF, &TE](const EdgeInfo &EI) {
                          return EI.UserTE->Scalars.size() == VF ||
                                 EI.UserTE->Scalars.size() ==
                                     TE->Scalars.size() ||
                                 isa<ShuffleVectorInst>(
                                     EI.UserTE->getMainOp());
                        }) &&
                 "All users must be of VF size.");
          // ...
          reorderNodeWithReuses(*TE, Mask);
        }
        continue;
      }
      if ((TE->State == TreeEntry::Vectorize ||
           TE->State == TreeEntry::StridedVectorize) &&
          (isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
               InsertElementInst>(TE->getMainOp()) ||
           (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
        assert(!TE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        reorderScalars(TE->Scalars, Mask);
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // ...
        SmallVector<int> NewReuses;
        inversePermutation(BestOrder, NewReuses);
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      }
    }
  }
}
bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    ArrayRef<TreeEntry *> ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize);
        }))
      continue;
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // Do not reorder if the operand node is used by many user nodes.
      if (any_of(TE->UserTreeIndices,
                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
        return false;
      // Add the node to the list of the ordered nodes with identity order.
      Edges.emplace_back(I, TE);
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    TreeEntry *Gather = nullptr;
    if (count_if(ReorderableGathers,
                 [&Gather, UserTE, I](TreeEntry *TE) {
                   assert(TE->State != TreeEntry::Vectorize &&
                          TE->State != TreeEntry::StridedVectorize &&
                          "Only non-vectorized nodes are expected.");
                   if (any_of(TE->UserTreeIndices,
                              [UserTE, I](const EdgeInfo &EI) {
                                return EI.UserTE == UserTE && EI.EdgeIdx == I;
                              })) {
                     assert(TE->isSame(UserTE->getOperand(I)) &&
                            "Operand entry does not match operands.");
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize)
      NonVectorized.push_back(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false)) {
      OrderedEntries.insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
    }
  }

  // Propagate the order to the graph nodes that use only reordered nodes.
  DenseSet<const TreeEntry *> Visited;
  while (!OrderedEntries.empty()) {
    // 1. Filter out only reordered nodes.
    DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
    SmallVector<TreeEntry *> Filtered;
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          !all_of(drop_begin(TE->UserTreeIndices),
                  [TE](const EdgeInfo &EI) {
                    return EI.UserTE == TE->UserTreeIndices.front().UserTE;
                  }) ||
          !Visited.insert(TE).second) {
        Filtered.push_back(TE);
        continue;
      }
      // Build a map between user nodes and their operands order.
      for (EdgeInfo &EI : TE->UserTreeIndices)
        Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
    }
    // Erase filtered entries so they are not reconsidered.
    for (TreeEntry *TE : Filtered)
      OrderedEntries.remove(TE);
    SmallVector<
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
        UsersVec(Users.begin(), Users.end());
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    });
    for (auto &Data : UsersVec) {
      // Check that operands are used only in the User node.
      SmallVector<TreeEntry *> GatherOps;
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              GatherOps)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // All operands are reordered and used only in this node - propagate
      // the most used order to the user node.
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() &&
            !GathersToOrders.count(OpTE))
          continue;
        const auto Order = [&]() -> const OrdersType {
          if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
            return getReorderingData(*OpTE, /*TopToBottom=*/false)
                .value_or(OrdersType(1));
          return OpTE->ReorderIndices;
        }();
        // A partially ordered node; skip in favor of fully ordered ones.
        if (Order.size() == 1)
          continue;
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask, not the order, need to invert.
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          assert(!OpTE->isAltShuffle() &&
                 "Alternate instructions are only supported by BinaryOperator "
                 "and CastInst.");
          SmallVector<int> Mask;
          inversePermutation(Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(CurrentOrder);
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
              NumOps;
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        }
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto AllowsReordering = [&](const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() ||
              !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->isGather()) {
            // ...
            return true;
          }
          return false;
        };
        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
          TreeEntry *UserTE = EI.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            continue;
          // May reorder the user node if it requires reordering or has
          // reused scalars.
          if (AllowsReordering(UserTE))
            continue;
          // Check if users allow reordering: count operands that allow it.
          const auto &Ops = Users[UserTE];
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           all_of(Op.second->UserTreeIndices,
                                  [UserTE](const EdgeInfo &EI) {
                                    return EI.UserTE == UserTE;
                                  });
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
      if (OrdersUses.empty()) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // Choose the most used order.
      unsigned IdentityCnt = 0;
      unsigned VF = Data.second.front().second->getVectorFactor();
      OrdersType IdentityOrder(VF, VF);
      for (auto &Pair : OrdersUses) {
        if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
          IdentityCnt += Pair.second;
          combineOrders(IdentityOrder, Pair.first);
        }
      }
      OrdersType BestOrder = IdentityOrder;
      unsigned Cnt = IdentityCnt;
      for (auto &Pair : OrdersUses) {
        if (Cnt < Pair.second) {
          combineOrders(Pair.first, BestOrder);
          BestOrder = Pair.first;
          Cnt = Pair.second;
        } else {
          combineOrders(BestOrder, Pair.first);
        }
      }
      // Set order of the user node.
      if (isIdentityOrder(BestOrder)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      fixupOrderingIndices(BestOrder);
      SmallVector<int> Mask;
      inversePermutation(BestOrder, Mask);
      unsigned E = BestOrder.size();
      SmallVector<int> MaskOrder(E, PoisonMaskElem);
      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        OrderedEntries.remove(TE);
        if (!VisitedOps.insert(TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::StridedVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(TE->ReorderIndices, Mask);
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers, just reorder their scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // Just reorder the reuses indices.
          reorderReuses(Gather->ReuseShuffleIndices, Mask);
          continue;
        }
        reorderScalars(Gather->Scalars, Mask);
        OrderedEntries.remove(Gather);
      }
      // Reorder operands of the user node and set the ordering for the user
      // node itself.
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Data.first->getMainOp()) ||
          Data.first->isAltShuffle())
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle() ||
          Data.first->State == TreeEntry::StridedVectorize) {
        reorderScalars(Data.first->Scalars, Mask);
        reorderOrder(Data.first->ReorderIndices, MaskOrder,
                     /*BottomOrder=*/true);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !Data.first->isAltShuffle()) {
          // Insert the user node into the list to try to sink reordering
          // deeper in the graph.
          OrderedEntries.insert(Data.first);
        }
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if ((Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
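// Note (added commentary): for strided loads/stores emitted with a reversed
// order, the IR anchor is the scalar at the first reordered position rather
// than the first scalar in the bundle, hence the special case above.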
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    if (Entry->isGather())
      continue;

    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
        continue;
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;

      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
        continue;
      }
      for (User *U : Scalar->users()) {
        // ...
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;

        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          if (UseEntry->State == TreeEntry::ScatterVectorize ||
              !doesInTreeUserNeedToExtract(
                  Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(!UseEntry->isGather() && "Bad state");
            continue;
          }
          U = nullptr;
          if (It != ScalarToExtUses.end()) {
            ExternalUses[It->second].User = nullptr;
            break;
          }
        }

        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          U = nullptr;
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, FoundLane);
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  // ...
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    if (!isa<Instruction>(V))
      continue;

    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      if (getTreeEntry(U))
        continue;

      Value *Ptr =
          getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(),
                                        Ptr}];
      // For now, just add the stores in the order they appear per lane.
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        std::optional<int> Diff = getPointersDiff(
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
            /*StrictCheck=*/true);
        if (!Diff)
          continue;
      }
      StoresVec.push_back(SI);
    }
  }
  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
  unsigned I = 0;
  for (auto &P : PtrToStoresMap) {
    Res[I].swap(P.second);
    ++I;
  }
  return Res;
}
bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // ...
  StoreInst *S0 = StoresVec.front();
  SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
  StoreOffsetVec.emplace_back(0, 0);
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int> Diff =
        getPointersDiff(S0->getValueOperand()->getType(),
                        S0->getPointerOperand(),
                        SI->getValueOperand()->getType(),
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    if (Diff)
      StoreOffsetVec.emplace_back(*Diff, Idx);
  }

  if (StoreOffsetVec.size() != StoresVec.size())
    return false;
  sort(StoreOffsetVec,
       [](const std::pair<int, unsigned> &L,
          const std::pair<int, unsigned> &R) { return L.first < R.first; });
  unsigned Idx = 0;
  int PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
      return false;
    PrevDist = P.first;
    ++Idx;
  }

  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
  for (auto [I, P] : enumerate(StoreOffsetVec)) {
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  }
  // Identity order (e.g. {0, 1, 2, 3}) is represented by an empty
  // ReorderIndices vector, as is done elsewhere in the vectorizer.
  if (IsIdentity)
    ReorderIndices.clear();
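// Worked example (added commentary): stores to a[3], a[1], a[2], a[0] in
// lanes 0..3 produce sorted offsets 0, 1, 2, 3 coming from lanes 3, 1, 2, 0,
// so ReorderIndices becomes {3, 1, 2, 0}; an identity result is cleared to
// signal that no reordering is needed.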
#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif

SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();

  SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);

  SmallVector<OrdersType, 1> ExternalReorderIndices;
  for (ArrayRef<StoreInst *> StoresVec : Stores) {
    if (StoresVec.size() != NumLanes)
      continue;
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}
                                            bool AddNew = true) {
  // ...
  for (Value *V : VL) {
    auto *LI = dyn_cast<LoadInst>(V);
    if (!LI)
      continue;
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
      continue;
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
      std::optional<int> Dist = getPointersDiff(
          LI->getType(), LI->getPointerOperand(),
          Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
          /*StrictCheck=*/true);
      if (!Dist)
        continue;
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
        continue;
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
      }
      IsFound = true;
      break;
    }
    if (!IsFound) {
      ClusteredLoads.emplace_back().emplace_back(LI, 0);
      ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
    }
  }
  auto FindMatchingLoads =
      [&](ArrayRef<std::pair<LoadInst *, int>> Loads,
          SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>>
              &GatheredLoads,
          SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
          int &Offset, unsigned &Start) {
        if (Loads.empty())
          return GatheredLoads.end();
        // ...
        LoadInst *LI = Loads.front().first;
        for (auto [Idx, Data] : enumerate(GatheredLoads)) {
          if (Idx < Start)
            continue;
          ToAdd.clear();
          std::optional<int> Dist = getPointersDiff(
              LI->getType(), LI->getPointerOperand(),
              Data.front().first->getType(),
              Data.front().first->getPointerOperand(), DL, SE,
              /*StrictCheck=*/true);
          if (!Dist)
            continue;
          SmallSet<int, 4> DataDists;
          SmallPtrSet<LoadInst *, 4> DataLoads;
          for (std::pair<LoadInst *, int> P : Data) {
            DataDists.insert(P.second);
            DataLoads.insert(P.first);
          }
          unsigned NumUniques = 0;
          for (auto [Cnt, Pair] : enumerate(Loads)) {
            bool Used = DataLoads.contains(Pair.first);
            if (!Used && !DataDists.contains(*Dist + Pair.second)) {
              ++NumUniques;
              ToAdd.insert(Cnt);
            } else if (Used) {
              Repeated.insert(Cnt);
            }
          }
          if (NumUniques > 0 &&
              (Loads.size() == NumUniques ||
               (Loads.size() - NumUniques >= 2 &&
                Loads.size() - NumUniques >= Loads.size() / 2))) {
            Offset = *Dist;
            Start = Idx + 1;
            return std::next(GatheredLoads.begin(), Idx);
          }
        }
        return GatheredLoads.end();
      };
  for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
    unsigned Start = 0;
    SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
    int Offset = 0;
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                                 Offset, Start);
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
        It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
      ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                             Offset, Start);
    }
    if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
          return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        })) {
      auto AddNewLoads =
          [&](SmallVectorImpl<std::pair<LoadInst *, int>> &Loads) {
            for (unsigned Idx : seq<unsigned>(Data.size())) {
              if (ToAdd.contains(Idx) || Repeated.contains(Idx))
                continue;
              Loads.push_back(Data[Idx]);
            }
          };
      if (!AddNew) {
        LoadInst *LI = Data.front().first;
        It = find_if(GatheredLoads,
                     [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
                       return PD.front().first->getParent() ==
                                  LI->getParent() &&
                              PD.front().first->getType() == LI->getType();
                     });
        while (It != GatheredLoads.end()) {
          AddNewLoads(*It);
          It = std::find_if(
              std::next(It), GatheredLoads.end(),
              [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
                return PD.front().first->getParent() == LI->getParent() &&
                       PD.front().first->getType() == LI->getType();
              });
        }
        continue;
      }
      GatheredLoads.emplace_back().append(Data.begin(), Data.end());
      AddNewLoads(GatheredLoads.emplace_back());
    }
  }
}
void BoUpSLP::tryToVectorizeGatheredLoads(
    const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                         SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
                         8> &GatheredLoads) {
  GatheredLoadsEntriesFirst = VectorizableTree.size();

  SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
      LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert(VectorizableTree[Idx]->Scalars.begin(),
               VectorizableTree[Idx]->Scalars.end());

  // Sort loads by distance.
  auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
                       const std::pair<LoadInst *, int> &L2) {
    return L1.second > L2.second;
  };

  auto IsMaskedGatherSupported = [&](ArrayRef<LoadInst *> Loads) {
    SmallVector<Value *> Values(Loads.begin(), Loads.end());
    Align Alignment = computeCommonAlignment<LoadInst>(Values);
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    return TTI->isLegalMaskedGather(Ty, Alignment) &&
           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
  };

  auto GetVectorizedRanges =
      [this](ArrayRef<LoadInst *> Loads,
             SmallPtrSetImpl<LoadInst *> &VectorizedLoads,
             SmallVectorImpl<LoadInst *> &NonVectorized, bool Final,
             unsigned MaxVF) {
        SmallVector<std::pair<ArrayRef<Value *>, LoadsState>, 4> Results;
        unsigned StartIdx = 0;
        SmallVector<int> CandidateVFs;
        for (int NumElts = getFloorFullVectorNumberOfElements(
                 *TTI, Loads.front()->getType(), MaxVF);
             NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
                              *TTI, Loads.front()->getType(), NumElts - 1))
          CandidateVFs.push_back(NumElts);
        if (Final && CandidateVFs.empty())
          return Results;

        unsigned BestVF = Final ? CandidateVFs.back() : 0;
        for (unsigned NumElts : CandidateVFs) {
          if (Final && NumElts > BestVF)
            continue;
          SmallVector<unsigned> MaskedGatherVectorized;
          for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; ++Cnt) {
            ArrayRef<LoadInst *> Slice =
                ArrayRef(Loads).slice(Cnt, std::min<unsigned>(NumElts,
                                                              E - Cnt));
            if (VectorizedLoads.count(Slice.front()) ||
                VectorizedLoads.count(Slice.back()))
              continue;
            // Check if it is profitable to try vectorizing the gathered
            // loads.
            bool AllowToVectorize = false;
            // For 2-element slices, check uses of the loads first.
            auto CheckIfAllowed = [&](ArrayRef<LoadInst *> Slice) {
              for (LoadInst *LI : Slice) {
                // A single use/user - allow to vectorize.
                if (LI->hasOneUse())
                  continue;
                if (static_cast<unsigned int>(
                        std::distance(LI->user_begin(), LI->user_end())) !=
                    LI->getNumUses())
                  return false;
                if (!IsLegalBroadcastLoad)
                  continue;
                for (User *U : LI->users()) {
                  if (auto *UI = dyn_cast<Instruction>(U);
                      UI && isDeleted(UI))
                    continue;
                  if (const TreeEntry *UTE = getTreeEntry(U)) {
                    for (int I : seq<int>(UTE->getNumOperands())) {
                      if (all_of(UTE->getOperand(I),
                                 [LI](Value *V) { return V == LI; }))
                        // Found a legal broadcast - do not vectorize.
                        return false;
                    }
                  }
                }
              }
              return true;
            };
            if (NumElts == 2) {
              AllowToVectorize = CheckIfAllowed(Slice);
            } else {
              AllowToVectorize =
                  any_of(ValueToGatherNodes.at(Slice.front()),
                         [=](const TreeEntry *TE) {
                           return TE->Scalars.size() == 2 &&
                                  ((TE->Scalars.front() == Slice.front() &&
                                    TE->Scalars.back() == Slice.back()) ||
                                   (TE->Scalars.front() == Slice.back() &&
                                    TE->Scalars.back() == Slice.front()));
                         });
            }
            if (AllowToVectorize) {
              SmallVector<Value *> PointerOps;
              OrdersType CurrentOrder;
              // Try to build a vector load.
              ArrayRef<Value *> Values(
                  reinterpret_cast<Value *const *>(Slice.begin()),
                  Slice.size());
              LoadsState LS = canVectorizeLoads(Values, Slice.front(),
                                                CurrentOrder, PointerOps,
                                                &BestVF);
              if (LS != LoadsState::Gather ||
                  (BestVF > 1 &&
                   static_cast<unsigned>(NumElts) == 2 * BestVF)) {
                if (LS == LoadsState::ScatterVectorize) {
                  if (MaskedGatherVectorized.empty() ||
                      Cnt >= MaskedGatherVectorized.back() + NumElts)
                    MaskedGatherVectorized.push_back(Cnt);
                  continue;
                }
                Results.emplace_back(Values, LS);
                VectorizedLoads.insert(Slice.begin(), Slice.end());
                // If we vectorized the initial block, no need to retry it.
                if (Cnt == StartIdx)
                  StartIdx += NumElts;
              }
              if (StartIdx >= Loads.size())
                break;
              if (!MaskedGatherVectorized.empty() &&
                  Cnt < MaskedGatherVectorized.back() + NumElts)
                continue;
            }
            if (!AllowToVectorize || BestVF == 0)
              registerNonVectorizableLoads(Slice);
          }
          // Mark masked-gather candidates as vectorized, if any.
          for (unsigned Cnt : MaskedGatherVectorized) {
            ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
                Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
            ArrayRef<Value *> Values(
                reinterpret_cast<Value *const *>(Slice.begin()),
                Slice.size());
            Results.emplace_back(Values, LoadsState::ScatterVectorize);
            VectorizedLoads.insert(Slice.begin(), Slice.end());
            if (Cnt == StartIdx)
              StartIdx += NumElts;
          }
        }
        for (LoadInst *LI : Loads) {
          if (!VectorizedLoads.contains(LI))
            NonVectorized.push_back(LI);
        }
        return Results;
      };
  auto ProcessGatheredLoads =
      [&, &TTI = *TTI](
          ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
          bool Final = false) {
        SmallVector<LoadInst *> NonVectorized;
        for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists :
             GatheredLoads) {
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
            continue;
          }
          SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
          SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
          transform(LoadsDists, OriginalLoads.begin(),
                    [](const std::pair<LoadInst *, int> &L) -> LoadInst * {
                      return L.first;
                    });
          stable_sort(LocalLoadsDists, LoadSorter);
          SmallVector<LoadInst *> Loads;
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
            if (getTreeEntry(L.first))
              continue;
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<unsigned>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
              Loads.push_back(L.first);
              continue;
            }
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
                !Loads.empty())
              Loads.pop_back();
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
            Loads.push_back(L.first);
          }
          if (Loads.size() <= 1)
            continue;
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
            continue;
          BoUpSLP::ValueSet VectorizedLoads;
          SmallVector<LoadInst *> SortedNonVectorized;
          SmallVector<std::pair<ArrayRef<Value *>, LoadsState>, 4> Results =
              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                                  Final, MaxConsecutiveDistance);
          if (!Results.empty() && !SortedNonVectorized.empty() &&
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size()) {
            VectorizedLoads.clear();
            SmallVector<LoadInst *> UnsortedNonVectorized;
            SmallVector<std::pair<ArrayRef<Value *>, LoadsState>, 4>
                UnsortedResults =
                    GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                        UnsortedNonVectorized, Final,
                                        OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
            }
          }
          for (auto [Slice, _] : Results) {
            LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
                              << Slice.size() << ")\n");
            if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) {
              for (Value *L : Slice)
                if (!getTreeEntry(L))
                  SortedNonVectorized.push_back(cast<LoadInst>(L));
              continue;
            }

            // Select the maximum VF as a maximum of user gathered nodes and
            // distance between scalar loads in these nodes.
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
            if (MaxVF == 2) {
              UserMaxVF = MaxVF;
            } else {
              // Found distance between segments of the interleaved loads.
              std::optional<unsigned> InterleavedLoadsDistance = 0;
              unsigned Order = 0;
              std::optional<unsigned> CommonVF = 0;
              DenseMap<const TreeEntry *, unsigned> EntryToPosition;
              SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
              for (auto [Idx, V] : enumerate(Slice)) {
                for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                  UserMaxVF = std::max<unsigned>(UserMaxVF,
                                                 E->Scalars.size());
                  unsigned Pos =
                      EntryToPosition.try_emplace(E, Idx).first->second;
                  UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                  if (CommonVF) {
                    if (*CommonVF == 0) {
                      CommonVF = E->Scalars.size();
                      continue;
                    }
                    if (*CommonVF != E->Scalars.size())
                      CommonVF.reset();
                  }
                  // Check if the load is part of an interleaved load.
                  if (Pos != Idx && InterleavedLoadsDistance) {
                    if (!DeinterleavedNodes.contains(E) &&
                        any_of(E->Scalars, [&, Slice = Slice](Value *V) {
                          if (isa<Constant>(V))
                            return false;
                          if (getTreeEntry(V))
                            return true;
                          const auto &Nodes = ValueToGatherNodes.at(V);
                          return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                                 !is_contained(Slice, V);
                        })) {
                      InterleavedLoadsDistance.reset();
                      continue;
                    }
                    DeinterleavedNodes.insert(E);
                    if (*InterleavedLoadsDistance == 0) {
                      InterleavedLoadsDistance = Idx - Pos;
                      continue;
                    }
                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
                      InterleavedLoadsDistance.reset();
                    Order =
                        (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
                  }
                }
              }
              DeinterleavedNodes.clear();
              // Check if the large load represents an interleaved load.
              if (InterleavedLoadsDistance.value_or(0) > 1 &&
                  CommonVF.value_or(0) != 0) {
                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
                unsigned VF = *CommonVF;
                // Segmented load detected - vectorize at maximum vector
                // factor.
                if (InterleaveFactor <= Slice.size() &&
                    TTI.isLegalInterleavedAccessType(
                        getWidenedType(Slice.front()->getType(), VF),
                        InterleaveFactor,
                        cast<LoadInst>(Slice.front())->getAlign(),
                        cast<LoadInst>(Slice.front())
                            ->getPointerAddressSpace())) {
                  UserMaxVF = InterleaveFactor * VF;
                } else {
                  InterleaveFactor = 0;
                }
              }
              // Cannot represent the loads as consecutive vectorizable
              // nodes - just exit.
              unsigned ConsecutiveNodesSize = 0;
              if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                  any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                         [&, Slice = Slice](const auto &P) {
                           const auto *It = find_if(Slice, [&](Value *V) {
                             return std::get<1>(P).contains(V);
                           });
                           if (It == Slice.end())
                             return false;
                           ArrayRef<Value *> VL =
                               VectorizableTree[std::get<0>(P)]->Scalars;
                           ConsecutiveNodesSize += VL.size();
                           unsigned Start = std::distance(Slice.begin(), It);
                           unsigned Sz = Slice.size() - Start;
                           return Sz < VL.size() ||
                                  Slice.slice(std::distance(Slice.begin(),
                                                            It),
                                              VL.size()) != VL;
                         }))
                continue;
              // Try to build long masked-gather loads.
              UserMaxVF = bit_ceil(UserMaxVF);
              if (InterleaveFactor == 0 &&
                  any_of(seq<unsigned>(Slice.size() / UserMaxVF),
                         [&, Slice = Slice](unsigned Idx) {
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           return canVectorizeLoads(
                                      Slice.slice(Idx * UserMaxVF, UserMaxVF),
                                      Slice[Idx * UserMaxVF], Order,
                                      PointerOps) ==
                                  LoadsState::ScatterVectorize;
                         }))
                UserMaxVF = MaxVF;
              if (Slice.size() != ConsecutiveNodesSize)
                MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            }
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                ArrayRef<Value *> SubSlice =
                    Slice.slice(I, std::min(VF, E - I));
                if (getTreeEntry(SubSlice.front()))
                  continue;
                // Check if the subslice is a to-be-vectorized entry that is
                // not equal to this entry.
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                             return !SubSlice.equals(
                                        VectorizableTree[std::get<0>(P)]
                                            ->Scalars) &&
                                    set_is_subset(SubSlice, std::get<1>(P));
                           }))
                  continue;
                unsigned Sz = VectorizableTree.size();
                buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                  // Try non-interleaved vectorization with a smaller vector
                  // factor.
                  if (InterleaveFactor > 0) {
                    VF = 2 * (MaxVF / InterleaveFactor);
                    InterleaveFactor = 0;
                  }
                  continue;
                }
              }
              if (IsVectorized)
                break;
            }
          }
          NonVectorized.append(SortedNonVectorized);
        }
        return NonVectorized;
      };
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
    if (!Ref.empty() && !NonVectorized.empty() &&
        std::accumulate(
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S,
               ArrayRef<std::pair<LoadInst *, int>> LoadsDists) -> unsigned {
              return S + LoadsDists.size();
            }) != NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
      for (LoadInst *LI : NonVectorized) {
        // Reinsert non-vectorized loads into another list of loads with the
        // same base pointers.
        gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
                                        FinalGatheredLoads,
                                        /*AddNew=*/false);
      }
      // Final attempt to vectorize non-vectorized loads.
      (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
    }
  }
  // Try to vectorize postponed load entries, previously marked as gathered.
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
    // Avoid reordering, if possible.
    if (!E.ReorderIndices.empty()) {
      SmallVector<int> ReorderMask;
      inversePermutation(E.ReorderIndices, ReorderMask);
      reorderScalars(GatheredScalars, ReorderMask);
    }
    buildTree_rec(GatheredScalars, 0, EdgeInfo());
  }
  // If no new entries were created, no gathered-loads entries need handling.
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
}
static Value *needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!NeedsScheduling) {
      NeedsScheduling = V;
      continue;
    }
    return nullptr;
  }
  return NeedsScheduling;
}
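// Note (added commentary): the helper returns the unique value of VL that
// actually requires scheduling, and nullptr both when no value needs
// scheduling and when more than one does; a non-null result lets callers
// take a cheaper single-instruction bundle path.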
                                                 bool AllowAlternate) {
  // ...
  if (auto *LI = dyn_cast<LoadInst>(V)) {
    // Sort the loads by the distance between the pointers.
    if (LI->isSimple())
      SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
    else
      Key = SubKey = hash_value(LI);
  } else if (isa<ExtractElementInst, UndefValue>(V)) {
    // Sort extracts by the vector operands.
    if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
      if (!isUndefVector(EI->getVectorOperand()).all() &&
          !isa<UndefValue>(EI->getIndexOperand()))
        SubKey = hash_value(EI->getVectorOperand());
    }
  } else if (auto *I = dyn_cast<Instruction>(V)) {
    // Sort other instructions just by the opcodes, except for CmpInst,
    // which is also sorted by the predicate kind.
    if ((isa<BinaryOperator, CastInst>(I)) &&
        isValidForAlternation(I->getOpcode())) {
      Key = hash_combine(
          hash_value(I->getOpcode()),
          hash_value(isa<BinaryOperator>(I)
                         ? I->getType()
                         : cast<CastInst>(I)->getOperand(0)->getType()));
      if (isa<CastInst>(I)) {
        std::pair<size_t, size_t> OpVals =
            generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
                              /*AllowAlternate=*/true);
        Key = hash_combine(OpVals.first, Key);
        SubKey = hash_combine(OpVals.first, SubKey);
      }
    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
      CmpInst::Predicate Pred = CI->getPredicate();
      if (CI->isCommutative())
        Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
      // ...
    } else if (auto *Call = dyn_cast<CallInst>(I)) {
      // ...
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
        SubKey = hash_value(Gep->getPointerOperand());
      else
        SubKey = hash_value(Gep);
    } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
               !isa<ConstantInt>(I->getOperand(1))) {
      // Do not try to vectorize instructions with potentially high cost.
      SubKey = hash_value(I);
    } else {
      SubKey = hash_value(I->getOpcode());
    }
    Key = hash_combine(hash_value(I->getParent()), Key);
  }
  return std::make_pair(Key, SubKey);
}
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       ArrayRef<Value *> VL) const {
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  // If this pattern is supported by the target, consider it profitable.
  if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(),
                                          VL.size()),
                           Opcode0, Opcode1, OpcodeMask))
    return true;
  SmallVector<ValueList> Operands;
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    Operands.emplace_back();
    Operands.back().reserve(VL.size());
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        Operands.back().push_back(
            PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
        continue;
      }
      Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
    }
  }
  if (Operands.size() == 2) {
    // Try to find the best operand candidates.
    for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
      // ...
      switch (Res.value_or(0)) {
      case 0:
        break;
      case 1:
        std::swap(Operands[0][I + 1], Operands[1][I + 1]);
        break;
      case 2:
        std::swap(Operands[0][I], Operands[1][I]);
        break;
      default:
        llvm_unreachable("Unexpected index.");
      }
    }
  }
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
  unsigned NonInstCnt = 0;
  unsigned UndefCnt = 0;
  // Count the number of extra shuffles required for vector nodes.
  unsigned ExtraShuffleInsts = 0;
  if (Operands.size() == 2) {
    // Do not count the same operands twice.
    if (Operands.front() == Operands.back()) {
      Operands.erase(Operands.begin());
    } else if (!allConstant(Operands.front()) &&
               all_of(Operands.front(), [&](Value *V) {
                 return is_contained(Operands.back(), V);
               })) {
      Operands.erase(Operands.begin());
      ++ExtraShuffleInsts;
    }
  }
  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
  return none_of(Operands,
                 [&](ArrayRef<Value *> Op) {
                   // ...
                   DenseMap<Value *, unsigned> Uniques;
                   for (Value *V : Op) {
                     if (isa<Constant, ExtractElementInst>(V) ||
                         getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
                       if (isa<UndefValue>(V))
                         ++UndefCnt;
                       continue;
                     }
                     auto Res = Uniques.try_emplace(V, 0);
                     // Found the first duplicate - need an extra shuffle.
                     if (!Res.second && Res.first->second == 1)
                       ++ExtraShuffleInsts;
                     ++Res.first->getSecond();
                     if (auto *I = dyn_cast<Instruction>(V))
                       UniqueOpcodes.insert(I->getOpcode());
                     else if (Res.second)
                       ++NonInstCnt;
                   }
                   return none_of(Uniques, [&](const auto &P) {
                     return P.first->hasNUsesOrMore(P.second + 1) &&
                            none_of(P.first->users(), [&](User *U) {
                              return getTreeEntry(U) || Uniques.contains(U);
                            });
                   });
                 }) ||
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}
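// Note (added commentary): the profitability check above weighs the number
// of distinct opcodes, extra shuffles and non-instruction operands that the
// gather path would need against NumAltInsts, the three instructions a
// vectorized alternate-opcode node costs (two vector ops plus a blending
// shuffle); operands that look like buildvectors with many repeats push the
// decision toward gathering.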
7606BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7608 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7611 "Expected instructions with same/alternate opcodes only.");
7613 unsigned ShuffleOrOp =
7614 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
7616 switch (ShuffleOrOp) {
7617 case Instruction::PHI: {
7620 return TreeEntry::NeedToGather;
7622 for (
Value *V : VL) {
7623 auto *
PHI = dyn_cast<PHINode>(V);
7628 if (Term &&
Term->isTerminator()) {
7630 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
7631 return TreeEntry::NeedToGather;
7636 return TreeEntry::Vectorize;
7638 case Instruction::ExtractValue:
7639 case Instruction::ExtractElement: {
7640 bool Reuse = canReuseExtract(VL, CurrentOrder);
7644 return TreeEntry::NeedToGather;
7645 if (Reuse || !CurrentOrder.empty())
7646 return TreeEntry::Vectorize;
7648 return TreeEntry::NeedToGather;
7650 case Instruction::InsertElement: {
7654 for (
Value *V : VL) {
7655 SourceVectors.
insert(cast<Instruction>(V)->getOperand(0));
7657 "Non-constant or undef index?");
7661 return !SourceVectors.contains(V);
7664 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
7665 "different source vectors.\n");
7666 return TreeEntry::NeedToGather;
7671 return SourceVectors.contains(V) && !
V->hasOneUse();
7674 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
7675 "multiple uses.\n");
7676 return TreeEntry::NeedToGather;
7679 return TreeEntry::Vectorize;
7681 case Instruction::Load: {
7690 return TreeEntry::Vectorize;
7692 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7694 LoadEntriesToVectorize.insert(VectorizableTree.size());
7695 return TreeEntry::NeedToGather;
7697 return TreeEntry::ScatterVectorize;
7699 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7701 LoadEntriesToVectorize.insert(VectorizableTree.size());
7702 return TreeEntry::NeedToGather;
7704 return TreeEntry::StridedVectorize;
7708 if (
DL->getTypeSizeInBits(ScalarTy) !=
7709 DL->getTypeAllocSizeInBits(ScalarTy))
7710 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
7712 auto *LI = dyn_cast<LoadInst>(V);
7713 return !LI || !LI->isSimple();
7720 return TreeEntry::NeedToGather;
7724 case Instruction::ZExt:
7725 case Instruction::SExt:
7726 case Instruction::FPToUI:
7727 case Instruction::FPToSI:
7728 case Instruction::FPExt:
7729 case Instruction::PtrToInt:
7730 case Instruction::IntToPtr:
7731 case Instruction::SIToFP:
7732 case Instruction::UIToFP:
7733 case Instruction::Trunc:
7734 case Instruction::FPTrunc:
7735 case Instruction::BitCast: {
7737 for (
Value *V : VL) {
7738 if (isa<PoisonValue>(V))
7740 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7743 dbgs() <<
"SLP: Gathering casts with different src types.\n");
7744 return TreeEntry::NeedToGather;
7747 return TreeEntry::Vectorize;
7749 case Instruction::ICmp:
7750 case Instruction::FCmp: {
7755 for (
Value *V : VL) {
7756 if (isa<PoisonValue>(V))
7758 auto *
Cmp = cast<CmpInst>(V);
7759 if ((
Cmp->getPredicate() != P0 &&
Cmp->getPredicate() != SwapP0) ||
7760 Cmp->getOperand(0)->getType() != ComparedTy) {
7761 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering cmp with different predicate.\n");
7762 return TreeEntry::NeedToGather;
7765 return TreeEntry::Vectorize;
7767 case Instruction::Select:
7768 case Instruction::FNeg:
7769 case Instruction::Add:
7770 case Instruction::FAdd:
7771 case Instruction::Sub:
7772 case Instruction::FSub:
7773 case Instruction::Mul:
7774 case Instruction::FMul:
7775 case Instruction::UDiv:
7776 case Instruction::SDiv:
7777 case Instruction::FDiv:
7778 case Instruction::URem:
7779 case Instruction::SRem:
7780 case Instruction::FRem:
7781 case Instruction::Shl:
7782 case Instruction::LShr:
7783 case Instruction::AShr:
7784 case Instruction::And:
7785 case Instruction::Or:
7786 case Instruction::Xor:
7787 case Instruction::Freeze:
7788 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7790 auto *
I = dyn_cast<Instruction>(V);
7791 return I &&
I->isBinaryOp() && !
I->isFast();
7793 return TreeEntry::NeedToGather;
7794 return TreeEntry::Vectorize;
7795 case Instruction::GetElementPtr: {
7797 for (
Value *V : VL) {
7798 auto *
I = dyn_cast<GetElementPtrInst>(V);
7801 if (
I->getNumOperands() != 2) {
7802 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
7803 return TreeEntry::NeedToGather;
7809 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7810 for (
Value *V : VL) {
7811 auto *
GEP = dyn_cast<GEPOperator>(V);
7814 Type *CurTy =
GEP->getSourceElementType();
7816 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
7817 return TreeEntry::NeedToGather;
7823 for (
Value *V : VL) {
7824 auto *
I = dyn_cast<GetElementPtrInst>(V);
7827 auto *
Op =
I->getOperand(1);
7828 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7829 (
Op->getType() != Ty1 &&
7830 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7831 Op->getType()->getScalarSizeInBits() >
7832 DL->getIndexSizeInBits(
7833 V->getType()->getPointerAddressSpace())))) {
7835 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
7836 return TreeEntry::NeedToGather;
7840 return TreeEntry::Vectorize;
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(SI->getPointerOperand());
    }
    // Check the order of pointer operands.
    if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int> Dist =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      // Check that the sorted pointer operands are consecutive.
      if (static_cast<unsigned>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }

    LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // ...
    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
        ScalarArgs[J] = CI->getArgOperand(J);
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments and should be same in order
      // for them to be vectorized.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
          Value *A1J = CI2->getArgOperand(J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J
                       << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // REVEC can support non alternate shuffle.
      if (SLPReVec && getShufflevectorNumGroups(VL))
        return TreeEntry::Vectorize;
      // If this is not an alternate sequence of opcode like add-sub
      // then do not vectorize this instruction.
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}
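
/// Helper for vectorizing a bundle of phi nodes: for every incoming block of
/// the main phi it collects the matching incoming values of all phis in the
/// bundle, so the per-block operand vectors can be built in one pass.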
class PHIHandler {
  DominatorTree &DT;
  PHINode *Main = nullptr;
  SmallVector<Value *> Phis;
  SmallVector<SmallVector<Value *>> Operands;

public:
  PHIHandler() = delete;
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 SmallVector<Value *>(Phis.size(), nullptr)) {}
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    if (Main->getNumIncomingValues() <= FastLimit) {
      for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
        BasicBlock *InBB = Main->getIncomingBlock(I);
        // Prepare the operand vector for this incoming block.
        for (auto [Idx, V] : enumerate(Phis)) {
          auto *P = dyn_cast<PHINode>(V);
          if (!P) {
            assert(isa<PoisonValue>(V) &&
                   "Expected isa instruction or poison value.");
            Operands[I][Idx] = V;
            continue;
          }
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(I);
          else
            Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
        }
      }
      return;
    }
    SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
    for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
      BasicBlock *InBB = Main->getIncomingBlock(I);
      Blocks.try_emplace(InBB).first->second.push_back(I);
    }
    for (auto [Idx, V] : enumerate(Phis)) {
      if (isa<PoisonValue>(V)) {
        for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
          Operands[I][Idx] = V;
        continue;
      }
      auto *P = cast<PHINode>(V);
      for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
        BasicBlock *InBB = P->getIncomingBlock(I);
        auto It = Blocks.find(InBB);
        if (It == Blocks.end())
          continue;
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
      }
    }
    for (const auto &P : Blocks) {
      if (P.getSecond().size() <= 1)
        continue;
      unsigned BasicI = P.getSecond().front();
      for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
        assert(all_of(enumerate(Operands[I]),
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() == Operands[BasicI][Data.index()];
                      }) &&
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
      }
    }
  }
  ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
};
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                            const EdgeInfo &UserTreeIdx,
                            unsigned InterleaveFactor) {
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

  SmallVector<int> ReuseShuffleIndices;
  SmallVector<Value *> UniqueValues;
  SmallVector<Value *> NonUniqueValueVL;
  auto TryToFindDuplicates = [&](const InstructionsState &S,
                                 bool DoNotFail = false) {
    // Check that every instruction appears once in this bundle.
    SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
    for (Value *V : VL) {
      if (isConstant(V)) {
        ReuseShuffleIndices.emplace_back(
            isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
        UniqueValues.emplace_back(V);
        continue;
      }
      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
      ReuseShuffleIndices.emplace_back(Res.first->second);
      if (Res.second)
        UniqueValues.emplace_back(V);
    }
    size_t NumUniqueScalarValues = UniqueValues.size();
    bool IsFullVectors = hasFullVectorsOrPowerOf2(
        *TTI, UniqueValues.front()->getType(), NumUniqueScalarValues);
    if (NumUniqueScalarValues == VL.size() &&
        (VectorizeNonPowerOf2 || IsFullVectors)) {
      ReuseShuffleIndices.clear();
    } else {
      // FIXME: Reshuffling scalars is not supported yet for non-power-of-2
      // vectors.
      if ((UserTreeIdx.UserTE &&
           UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
          !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) {
        LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                             "for nodes with padding.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
          (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
             return isa<UndefValue>(V) || !isConstant(V);
           }))) {
        if (DoNotFail && UniquePositions.size() > 1 &&
            NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
            all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
          // Find the number of elements, which forms full vectors.
          unsigned PWSz = getFullVectorNumberOfElements(
              *TTI, UniqueValues.front()->getType(), UniqueValues.size());
          if (PWSz == VL.size()) {
            ReuseShuffleIndices.clear();
          } else {
            NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
            NonUniqueValueVL.append(
                PWSz - UniqueValues.size(),
                PoisonValue::get(UniqueValues.front()->getType()));
            // ...
            VL = NonUniqueValueVL;
          }
          return true;
        }
        LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      VL = UniqueValues;
    }
    return true;
  };

  InstructionsState S = getSameOpcode(VL, *TLI);
  // Don't vectorize in a block that ends with a catchswitch: there is no
  // legal insertion point for shuffles there.
  if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // Check if this is a duplicate of another entry.
  if (S) {
    if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
      LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
                        << ".\n");
      if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
        auto It = MultiNodeScalars.find(S.getMainOp());
        if (It != MultiNodeScalars.end()) {
          auto *TEIt = find_if(It->getSecond(),
                               [&](TreeEntry *ME) { return ME->isSame(VL); });
          if (TEIt != It->getSecond().end())
            E = *TEIt;
          else
            E = nullptr;
        } else {
          E = nullptr;
        }
      }
      if (!E) {
        if (!doesNotNeedToBeScheduled(S.getMainOp())) {
          LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
          if (TryToFindDuplicates(S))
            newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                         ReuseShuffleIndices);
          return;
        }
        SmallPtrSet<const TreeEntry *, 4> Nodes;
        Nodes.insert(getTreeEntry(S.getMainOp()));
        for (const TreeEntry *E : MultiNodeScalars.lookup(S.getMainOp()))
          Nodes.insert(E);
        SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
        if (any_of(Nodes, [&](const TreeEntry *E) {
              return all_of(E->Scalars,
                            [&](Value *V) { return Values.contains(V); });
            })) {
          LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
          if (TryToFindDuplicates(S))
            newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                         ReuseShuffleIndices);
          return;
        }
      } else {
        // Record the reuse of the tree node.
        E->UserTreeIndices.push_back(UserTreeIdx);
        LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
                          << ".\n");
        return;
      }
    }
  }

  // Gather if we hit the recursion depth limit, unless this is a wide
  // load (or extended load) bundle that is still worth trying.
  if (Depth >= RecursionMaxDepth &&
      !(S && !S.isAltShuffle() && VL.size() >= 4 &&
        (match(S.getMainOp(), m_Load(m_Value())) ||
         all_of(VL, [&S](const Value *I) {
           return match(I,
                        m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
                  cast<Instruction>(I)->getOpcode() == S.getOpcode();
         })))) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }

  // Don't handle scalable vectors.
  if (S && S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }

  // Don't handle vectors.
  if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }
  // If all of the operands are identical or constant we have a simple
  // solution. If we deal with insert/extract instructions, they all must
  // have constant indices, otherwise we should gather them, not try to
  // vectorize. If alternate op node with 2 elements with gathered operands -
  // do not vectorize.
  auto &&NotProfitableForVectorization = [&S, this,
                                          Depth](ArrayRef<Value *> VL) {
    if (!S || !S.isAltShuffle() || VL.size() > 2)
      return false;
    if (VectorizableTree.size() < MinTreeSize)
      return false;
    if (Depth >= RecursionMaxDepth - 1)
      return true;
    // Check if all operands are extracts, part of vector node or can build a
    // regular vectorize node.
    SmallVector<unsigned, 8> InstsCount;
    for (Value *V : VL) {
      auto *I = cast<Instruction>(V);
      InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
      }));
    }
    bool IsCommutative =
        isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        (!IsCommutative &&
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
      return true;
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    auto *I1 = cast<Instruction>(VL.front());
    auto *I2 = cast<Instruction>(VL.back());
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
            })) >= S.getMainOp()->getNumOperands() / 2)
      return false;
    if (S.getMainOp()->getNumOperands() > 2)
      return true;
    if (IsCommutative) {
      // Check permuted operands.
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
        Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                               I2->getOperand((Op + 1) % E));
      if (any_of(Candidates,
                 [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
                   return findBestRootPair(Cand,
                                           LookAheadHeuristics::ScoreSplat);
                 }))
        return false;
    }
    return true;
  };
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameBlock = S && allSameBlock(VL);
  BasicBlock *BB = nullptr;
  SmallVector<unsigned> SortedIndices;
  bool AreScatterAllGEPSameBlock =
      (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
       VL.size() > 2 &&
       all_of(VL,
              [&BB](Value *V) {
                auto *I = dyn_cast<GetElementPtrInst>(V);
                if (!I)
                  return doesNotNeedToBeScheduled(V);
                if (!BB)
                  BB = I->getParent();
                return BB == I->getParent() && I->getNumOperands() == 2;
              }) &&
       BB &&
       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL,
                       *SE, SortedIndices));
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
      (S &&
       isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
           S.getMainOp()) &&
       !all_of(VL, isVectorLikeInstWithConstOps)) ||
      NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }
  // Don't vectorize ephemeral values.
  if (S && !EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                          << ") is ephemeral.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return;
      }
    }
  }

  // Check that none of the instructions in the bundle are already in the
  // tree.
  for (Value *V : VL) {
    if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
        doesNotNeedToBeScheduled(V))
      continue;
    if (getTreeEntry(V)) {
      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                        << ") is already in tree.\n");
      if (TryToFindDuplicates(S))
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                     ReuseShuffleIndices);
      return;
    }
  }

  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        if (TryToFindDuplicates(S))
          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                       ReuseShuffleIndices);
        return;
      }
    }
  }
  // Special processing for sorted pointers for ScatterVectorize node with
  // constant indices only.
  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
    assert(VL.front()->getType()->isPointerTy() &&
           count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
           "Expected pointers only.");
    // Reset S to make it GetElementPtr kind of node.
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");
    S = getSameOpcode(*It, *TLI);
  }

  Instruction *VL0 = S.getMainOp();
  BB = VL0->getParent();
  if (!DT->isReachableFromEntry(BB)) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // Check that every instruction appears once in this bundle.
  if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
    return;

  // Perform specific checks for each particular instruction kind.
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
  if (State == TreeEntry::NeedToGather) {
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndices);
    return;
  }
  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);

  BlockScheduling &BS = *BSRef;

  std::optional<ScheduleData *> Bundle =
      BS.tryScheduleBundle(UniqueValues, this, S);
#ifdef EXPENSIVE_CHECKS
  // Make sure we didn't break any internal invariants.
  BS.verify();
#endif
  if (!Bundle) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    assert((!BS.getScheduleData(VL0) ||
            !BS.getScheduleData(VL0)->isPartOfBundle()) &&
           "tryScheduleBundle should cancelScheduling on failure");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
      registerNonVectorizableLoads(VL);
    return;
  }
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // Postpone PHI nodes creation.
    SmallVector<unsigned> PHIOps;
    for (unsigned I : seq<unsigned>(Operands.size())) {
      ArrayRef<Value *> Op = Operands[I];
      if (Op.empty())
        continue;
      InstructionsState S = getSameOpcode(Op, *TLI);
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
        buildTree_rec(Op, Depth + 1, {TE, I});
      else
        PHIOps.push_back(I);
    }
    for (unsigned I : PHIOps)
      buildTree_rec(Operands[I], Depth + 1, {TE, I});
  };
  switch (ShuffleOrOp) {
    case Instruction::PHI: {
      auto *PH = cast<PHINode>(VL0);

      TreeEntry *TE =
          newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
                 TE->dump());

      // Keeps the reordered operands to avoid code duplication.
      PHIHandler Handler(*DT, PH, VL);
      Handler.buildOperands();
      for (unsigned I : seq<unsigned>(PH->getNumOperands()))
        TE->setOperand(I, Handler.getOperands(I));
      SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands());
      for (unsigned I : seq<unsigned>(PH->getNumOperands()))
        Operands[I] = Handler.getOperands(I);
      CreateOperandNodes(TE, Operands);
      return;
    }
    case Instruction::ExtractValue:
    case Instruction::ExtractElement: {
      if (CurrentOrder.empty()) {
        LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
      } else {
        LLVM_DEBUG({
          dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                    "with order";
          for (unsigned Idx : CurrentOrder)
            dbgs() << " " << Idx;
          dbgs() << "\n";
        });
      }
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                           "(ExtractValueInst/ExtractElementInst).\n";
                 TE->dump());
      // This is a special case, as it does not gather, but at the same time
      // we are not extending buildTree_rec() towards the operands.
      TE->setOperand(*this);
      return;
    }
    case Instruction::InsertElement: {
      assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

      auto OrdCompare = [](const std::pair<int, int> &P1,
                           const std::pair<int, int> &P2) {
        return P1.first > P2.first;
      };
      std::priority_queue<std::pair<int, int>,
                          SmallVector<std::pair<int, int>>,
                          decltype(OrdCompare)>
          Indices(OrdCompare);
      for (int I = 0, E = VL.size(); I < E; ++I) {
        unsigned Idx = *getElementIndex(VL[I]);
        Indices.emplace(Idx, I);
      }
      OrdersType CurrentOrder(VL.size(), VL.size());
      bool IsIdentity = true;
      for (int I = 0, E = VL.size(); I < E; ++I) {
        CurrentOrder[Indices.top().second] = I;
        IsIdentity &= Indices.top().second == I;
        Indices.pop();
      }
      if (IsIdentity)
        CurrentOrder.clear();
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   {}, CurrentOrder);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
                 TE->dump());

      TE->setOperand(*this);
      buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
      return;
    }
    case Instruction::Load: {
      // Check that a vectorized load would load the same memory as a scalar
      // load.
      TreeEntry *TE = nullptr;
      fixupOrderingIndices(CurrentOrder);
      switch (State) {
      case TreeEntry::Vectorize:
        TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                          ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
        if (CurrentOrder.empty())
          LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                     TE->dump());
        else
          LLVM_DEBUG(dbgs()
                         << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                     TE->dump());
        break;
      case TreeEntry::StridedVectorize:
        // Vectorizing non-consecutive loads with a strided load.
        TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                   TE->dump());
        break;
      case TreeEntry::ScatterVectorize:
        // Vectorizing non-consecutive loads with `llvm.masked.gather`.
        TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndices);
        LLVM_DEBUG(
            dbgs()
                << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
            TE->dump());
        break;
      case TreeEntry::CombinedVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected loads state.");
      }
      TE->setOperand(*this);
      if (State == TreeEntry::ScatterVectorize)
        buildTree_rec(PointerOps, Depth + 1, {TE, 0});
      return;
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
          std::make_pair(std::numeric_limits<unsigned>::min(),
                         std::numeric_limits<unsigned>::max()));
      if (ShuffleOrOp == Instruction::ZExt ||
          ShuffleOrOp == Instruction::SExt) {
        CastMaxMinBWSizes = std::make_pair(
            std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                               PrevMaxBW),
            std::min<unsigned>(
                DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
                PrevMinBW));
      } else if (ShuffleOrOp == Instruction::Trunc) {
        CastMaxMinBWSizes = std::make_pair(
            std::max<unsigned>(
                DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
                PrevMaxBW),
            std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                               PrevMinBW));
      }
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
                 TE->dump());

      TE->setOperand(*this);
      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
        buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
      if (ShuffleOrOp == Instruction::Trunc) {
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      } else if (ShuffleOrOp == Instruction::SIToFP ||
                 ShuffleOrOp == Instruction::UIToFP) {
        unsigned NumSignBits =
            ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
        if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
          APInt Mask = DB->getDemandedBits(OpI);
          NumSignBits = std::max(NumSignBits, Mask.countl_zero());
        }
        if (NumSignBits * 2 >=
            DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
          ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      }
      return;
    }
    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Check that all of the compares have the same predicate.
      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
                 TE->dump());

      ValueList Left, Right;
      VLOperands Ops(VL, S, *this);
      if (cast<CmpInst>(VL0)->isCommutative()) {
        // Commutative predicate - collect + sort operands of the instructions
        // so that each side is more likely to have the same opcode.
        assert(P0 == CmpInst::getSwappedPredicate(P0) &&
               "Commutative Predicate mismatch");
        Ops.reorder();
        Left = Ops.getVL(0);
        Right = Ops.getVL(1);
      } else {
        // Collect operands - commute if it uses the swapped predicate.
        for (Value *V : VL) {
          if (isa<PoisonValue>(V)) {
            Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
            Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
            continue;
          }
          auto *Cmp = cast<CmpInst>(V);
          Value *LHS = Cmp->getOperand(0);
          Value *RHS = Cmp->getOperand(1);
          if (Cmp->getPredicate() != P0)
            std::swap(LHS, RHS);
          Left.push_back(LHS);
          Right.push_back(RHS);
        }
      }
      TE->setOperand(0, Left);
      TE->setOperand(1, Right);
      buildTree_rec(Left, Depth + 1, {TE, 0});
      buildTree_rec(Right, Depth + 1, {TE, 1});
      if (ShuffleOrOp == Instruction::ICmp) {
        unsigned NumSignBits0 =
            ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
        if (NumSignBits0 * 2 >=
            DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
          ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
        unsigned NumSignBits1 =
            ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
        if (NumSignBits1 * 2 >=
            DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
          ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
      }
      return;
    }
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Freeze: {
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry "
                    "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
          TE->dump());

      TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
        buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
      return;
    }
    case Instruction::GetElementPtr: {
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
                 TE->dump());
      SmallVector<ValueList, 2> Operands(2);
      // Prepare the operand vector for pointer operands.
      for (Value *V : VL) {
        auto *GEP = dyn_cast<GetElementPtrInst>(V);
        if (!GEP) {
          Operands.front().push_back(V);
          continue;
        }
        Operands.front().push_back(GEP->getPointerOperand());
      }
      TE->setOperand(0, Operands.front());
      // Need to cast all indices to the same type before vectorization to
      // avoid crash. Required to be able to find correct matches between
      // different gather nodes and reuse the vectorized values rather than
      // trying to gather them again.
      int IndexIdx = 1;
      Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
      Type *Ty = all_of(VL,
                        [VL0Ty, IndexIdx](Value *V) {
                          auto *GEP = dyn_cast<GetElementPtrInst>(V);
                          if (!GEP)
                            return true;
                          return VL0Ty == GEP->getOperand(IndexIdx)->getType();
                        })
                     ? VL0Ty
                     : DL->getIndexType(cast<GetElementPtrInst>(VL0)
                                            ->getPointerOperandType()
                                            ->getScalarType());
      // Prepare the operand vector.
      for (Value *V : VL) {
        auto *I = dyn_cast<GetElementPtrInst>(V);
        if (!I) {
          Operands.back().push_back(
              ConstantInt::get(Ty, 0, /*isSigned=*/false));
          continue;
        }
        auto *Op = I->getOperand(IndexIdx);
        auto *CI = dyn_cast<ConstantInt>(Op);
        if (!CI)
          Operands.back().push_back(Op);
        else
          Operands.back().push_back(ConstantFoldIntegerCast(
              CI, Ty, CI->getValue().isSignBitSet(), *DL));
      }
      TE->setOperand(IndexIdx, Operands.back());

      for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
        buildTree_rec(Operands[I], Depth + 1, {TE, I});
      return;
    }
    case Instruction::Store: {
      bool Consecutive = CurrentOrder.empty();
      if (!Consecutive)
        fixupOrderingIndices(CurrentOrder);
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices, CurrentOrder);
      if (Consecutive)
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                   TE->dump());
      else
        LLVM_DEBUG(
            dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
            TE->dump());
      TE->setOperand(*this);
      buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
      return;
    }
    case Instruction::Call: {
      // Check if the calls are all to the same vectorizable intrinsic or
      // library function.
      CallInst *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
                 TE->dump());
      TE->setOperand(*this, isCommutative(VL0));
      for (unsigned I : seq<unsigned>(CI->arg_size())) {
        // For scalar operands no need to create an entry since no need to
        // vectorize it.
        if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
          continue;
        buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
      }
      return;
    }
    case Instruction::ShuffleVector: {
      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                   ReuseShuffleIndices);
      if (S.isAltShuffle()) {
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                   TE->dump());
      } else {
        assert(SLPReVec && "Only supported by REVEC.");
        LLVM_DEBUG(
            dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
            TE->dump());
      }

      // Reorder operands if reordering would enable vectorization.
      auto *CI = dyn_cast<CmpInst>(VL0);
      if (CI && any_of(VL, [](Value *V) {
            return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
          })) {
        auto *MainCI = cast<CmpInst>(S.getMainOp());
        auto *AltCI = cast<CmpInst>(S.getAltOp());
        assert(MainCI->getPredicate() != AltCI->getPredicate() &&
               "Expected different main/alternate predicates.");
        ValueList Left, Right;
        // Collect operands - commute if it uses the swapped predicate or
        // alternate operation.
        for (Value *V : VL) {
          if (isa<PoisonValue>(V)) {
            Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
            Right.push_back(
                PoisonValue::get(MainCI->getOperand(1)->getType()));
            continue;
          }
          auto *Cmp = cast<CmpInst>(V);
          Value *LHS = Cmp->getOperand(0);
          Value *RHS = Cmp->getOperand(1);
          // ...
          Left.push_back(LHS);
          Right.push_back(RHS);
        }
        TE->setOperand(0, Left);
        TE->setOperand(1, Right);
        buildTree_rec(Left, Depth + 1, {TE, 0});
        buildTree_rec(Right, Depth + 1, {TE, 1});
        return;
      }

      TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
        buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
      return;
    }
    default:
      break;
    }
  llvm_unreachable("Unexpected vectorization of the instructions.");
}
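
/// Walks nested struct/array/vector types and returns the number of scalar
/// elements the aggregate type \p T flattens to, or 0 if it cannot be mapped
/// to a vector of a legal size.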
unsigned BoUpSLP::canMapToVector(Type *T) const {
  unsigned N = 1;
  Type *EltTy = T;

  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (EltTy->isEmptyTy())
      return 0;
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }

  if (!isValidElementType(EltTy))
    return 0;
  uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  return N;
}
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  assert(
      all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
      "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // same position.
  Value *Vec = E0->getOperand(0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of
  // elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    NElts = canMapToVector(Vec->getType());
    if (!NElts)
      return false;
    // Check if load can be rewritten as load of vector.
    LoadInst *LI = dyn_cast<LoadInst>(Vec);
    if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
      return false;
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }

  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      continue;
    if (Inst->getOperand(0) != Vec)
      return false;
    if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
      if (isa<UndefValue>(EE->getIndexOperand()))
        continue;
    std::optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      return false;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign to all items the initial value E + 1 so we can check if the
  // extract instruction index was used already. Also, later we can check
  // that all the indices are used and we have a consecutive access in the
  // extract instructions, by checking that no element of CurrentOrder still
  // has value E + 1 (it was not updated).
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return ScalarToTreeEntry.contains(U) ||
                  isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
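
/// Computes the cost of a vector intrinsic call next to the cost of the
/// equivalent vector library call, so the caller can pick the cheaper
/// lowering for the vectorized call.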
static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   ArrayRef<Type *> ArgTys) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // Calculate the cost of the scalar and vector calls.
  FastMathFlags FMF;
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF,
                                    dyn_cast<IntrinsicInst>(CI));
  auto IntrinsicCost =
      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
  // ...
  auto LibCost = IntrinsicCost;
  // ...
  return {IntrinsicCost, LibCost};
}
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp,
    SmallVectorImpl<int> &Mask, SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    if (isa<PoisonValue>(Scalars[Idx]))
      continue;
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}

static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    CmpInst::Predicate MainP = MainCI->getPredicate();
    CmpInst::Predicate AltP = AltCI->getPredicate();
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    CmpInst::Predicate P = CI->getPredicate();
    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);

    assert((MainP == P || AltP == P || MainP == SwappedP ||
            AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swap.");
    return MainP != P && MainP != SwappedP;
  }
  return I->getOpcode() == AltOp->getOpcode();
}
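
/// Summarizes a bundle of operands for TTI cost queries: whether the values
/// are all constant, uniform, powers of two or negated powers of two, which
/// refines the reported arithmetic instruction cost.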
TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
  assert(!Ops.empty());
  const auto *Op0 = Ops.front();

  const bool IsConstant = all_of(Ops, [](Value *V) {
    return isConstant(V) && !isa<UndefValue>(V);
  });
  const bool IsUniform = all_of(Ops, [=](Value *V) { return V == Op0; });
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();
    return false;
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
    return false;
  });

  TTI::OperandValueKind VK = TTI::OK_AnyValue;
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;

  TTI::OperandValueProperties VP = TTI::OP_None;
  VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
  VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;

  return {VK, VP};
}
/// The base class for shuffle instruction emission and shuffle cost
/// estimation.
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value.
  /// When REVEC is disabled, there is no difference between VF and
  /// VNumElements. When REVEC is enabled, VF is VNumElements /
  /// ScalarTyNumElements.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }
  /// Checks if the mask is an identity mask.
  /// \param IsStrict if is true the function returns false if mask size does
  /// not match vector size.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    int Index = -1;
    if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
      return true;
    if (!IsStrict) {
      // Consider extract subvector starting from index 0.
      if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
          Index == 0)
        return true;
      // All VF-size submasks are identity (e.g.
      // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF
      // 4).
      if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
            ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
            return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
                   ShuffleVectorInst::isIdentityMask(Slice, VF);
          }))
        return true;
    }
    return false;
  }

  /// Tries to combine 2 different masks into single one.
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(NewMask);
  }
  /// Looks through shuffles trying to reduce the final number of shuffles in
  /// the code: walks the previously emitted shuffle chain, combines masks and
  /// marks known poison elements as undef in the resulting mask.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // Exit if not a fixed vector type or changing size shuffle.
      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
      if (!SVTy)
        break;
      // Remember the identity or broadcast mask, if it is not a resizing
      // shuffle. If no better candidates are found, this Op and Mask will be
      // used in the final shuffle.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
                                                    IdentityMask.size()))) {
          IdentityOp = SV;
          // Store current mask in the IdentityMask so later we did not lose
          // this info if IdentityOp is selected as the best candidate for the
          // permutation.
          IdentityMask.assign(Mask);
        }
      }
      // Remember the broadcast mask.
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      bool IsOp1Undef = isUndefVector(SV->getOperand(0), ExtMask).all();
      bool IsOp2Undef = isUndefVector(SV->getOperand(1), ExtMask).all();
      if (!IsOp1Undef && !IsOp2Undef) {
        // Update mask and mark undef elems.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
        ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               /*IsStrict=*/true) ||
                (Shuffle &&
                 Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }
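
  // The createShuffle template below is shared between cost estimation and
  // codegen: both peek through existing shuffle chains, combine the masks
  // and skip emission entirely when the combined mask degenerates to an
  // identity.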
  /// Smart shuffle instruction emission, walks through shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder) {
    assert(V1 && "Expected at least one vector value.");
    if (V2)
      Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
                   V2, buildUseMask(VF, Mask, UseMask::SecondArg))
                   .all()) {
      // Peek through shuffles.
      Value *Op1 = V1;
      Value *Op2 = V2;
      int VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1;
      Value *PrevOp2;
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
        // Check if we have 2 resizing shuffles - need to peek through operands
        // again.
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
            SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask1)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask1[Idx] = SV1->getMaskValue(I);
            }
            SmallBitVector UseMask1 = buildUseMask(
                cast<FixedVectorType>(SV1->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask1, UseMask::SecondArg);
            SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask2)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask2[Idx] = SV2->getMaskValue(I);
            }
            SmallBitVector UseMask2 = buildUseMask(
                cast<FixedVectorType>(SV2->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask2, UseMask::SecondArg);
            if (SV1->getOperand(0)->getType() ==
                    SV2->getOperand(0)->getType() &&
                SV1->getOperand(0)->getType() != SV1->getType() &&
                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
              Op1 = SV1->getOperand(0);
              Op2 = SV2->getOperand(0);
              SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
              CombinedMask1.swap(ShuffleMask1);
              SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
              CombinedMask2.swap(ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(cast<VectorType>(Op1->getType())
                        ->getElementCount()
                        .getKnownMinValue(),
                    cast<VectorType>(Op2->getType())
                        ->getElementCount()
                        .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      if (Op1 == Op2 &&
          (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
           (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
            isa<ShuffleVectorInst>(Op1) &&
            cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
                ArrayRef(CombinedMask1))))
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    SmallVector<int> NewMask(Mask);
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");

    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }
  /// Transforms mask \p CommonMask per given \p Mask to make proper set after
  /// shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned I : seq<unsigned>(CommonMask.size()))
      if (Mask[I] != PoisonMaskElem)
        CommonMask[I] = I;
  }
};
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Estimate costs for pointer related costs when vectorizing to a wide
    // load/store. The scalar cost is estimated as a set of pointers with
    // known relationship between them.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);

    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity assume Ptr to stay in vectorized code if it's not a
      // GEP instruction; its cost is considered free.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }

    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then there are no savings.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(
        PtrsRetainedInVecCode, BasePtr,
        TTI::PointersChainInfo::getKnownStride(), VecTy, CostKind);
  } else {
    // Ptrs are the arguments of loads that will become a masked gather; all
    // scalar GEPs will be removed as a result of vectorization.
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();

    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }

  return std::make_pair(ScalarCost, VecCost);
}
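
/// Reorders the scalars of a gather node so that values with compatible
/// load/opcode keys become adjacent; the reordering is kept only if the
/// resulting shuffle is estimated to be cheaper than the plain buildvector.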
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
  SmallSet<size_t, 2> LoadKeyUsed;

  // Do not reorder nodes if it small (just 2 elements), all-constant or all
  // instructions have same opcode already.
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      all_of(TE.Scalars, isConstant))
    return;

  if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(TE.Scalars);
      }))
    return;

  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    Key = hash_combine(hash_value(LI->getParent()), Key);
    Value *Ptr =
        getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
    if (LoadKeyUsed.contains(Key)) {
      auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
      if (LIt != LoadsMap.end()) {
        for (LoadInst *RLI : LIt->second) {
          if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                              LI->getType(), LI->getPointerOperand(), *DL,
                              *SE, /*StrictCheck=*/true))
            return hash_value(RLI->getPointerOperand());
        }
        for (LoadInst *RLI : LIt->second) {
          if (arePointersCompatible(RLI->getPointerOperand(),
                                    LI->getPointerOperand(), *TLI))
            return hash_value(RLI->getPointerOperand());
        }
        if (LIt->second.size() > 2) {
          hash_code SubKey =
              hash_value(LIt->second.back()->getPointerOperand());
          return SubKey;
        }
      }
    }
    LoadKeyUsed.insert(Key);
    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
    return hash_value(LI->getPointerOperand());
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // Try to "cluster" scalar instructions, to be able to build extra
  // vectorized nodes.
  for (auto [I, V] : enumerate(TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    if (auto *Inst = dyn_cast<Instruction>(V);
        Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
        !isDeleted(Inst) && !isVectorized(V)) {
      std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
                                             /*AllowAlternate=*/false);
      ++NumInstructions;
    }
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        !(isa<Constant, ExtractElementInst>(V) ||
          isVectorLikeInstWithConstOps(V)) &&
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    if (KTI.empty())
      Container[Idx].push_back(V);
    KTI.push_back(I);
  }
  SmallVector<std::pair<unsigned, unsigned>> SubVectors;
  APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
  if (!IsOrdered && NumInstructions > 1) {
    unsigned Cnt = 0;
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        unsigned Sz = 0;
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        if (Sz > 1 && isa<Instruction>(P.second.front())) {
          const unsigned SubVF = getFloorFullVectorNumberOfElements(
              *TTI, TE.Scalars.front()->getType(), Sz);
          SubVectors.emplace_back(Cnt - Sz, SubVF);
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
            DemandedElts.clearBit(I);
        } else if (!P.second.empty() && isConstant(P.second.front())) {
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
            DemandedElts.clearBit(I);
        }
      }
    }
  }
  // Reuses always require shuffles, so consider it as profitable.
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // Do simple cost estimation.
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  auto *ScalarTy = TE.Scalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
  for (auto [Idx, Sz] : SubVectors) {
    Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {},
                             CostKind, Idx, getWidenedType(ScalarTy, Sz));
  }
  if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
    assert(SLPReVec && "Only supported by REVEC.");
    // When REVEC is enabled, subvector insertions are used instead of
    // insertelement.
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    for (unsigned I : seq<unsigned>(TE.Scalars.size()))
      if (DemandedElts[I])
        Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, {},
                                    CostKind, I * ScalarTyNumElements, FTy);
  } else {
    Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts,
                                          /*Insert=*/true,
                                          /*Extract=*/false, CostKind);
  }
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isa<PoisonValue>(V)) {
      ReorderMask[I] = PoisonMaskElem;
    } else if (isConstant(V) || DemandedElts[I]) {
      ReorderMask[I] = I + TE.ReorderIndices.size();
    }
  }
  Cost += ::getShuffleCost(*TTI,
                           any_of(ReorderMask, [&](int I) { return I >= Sz; })
                               ? TTI::SK_PermuteTwoSrc
                               : TTI::SK_PermuteSingleSrc,
                           VecTy, ReorderMask);
  DemandedElts = APInt::getAllOnes(TE.Scalars.size());
  ReorderMask.assign(Sz, PoisonMaskElem);
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isConstant(V)) {
      DemandedElts.clearBit(I);
      if (!isa<PoisonValue>(V))
        ReorderMask[I] = I;
    } else {
      ReorderMask[I] = I + Sz;
    }
  }
  InstructionCost BVCost = TTI->getScalarizationOverhead(
      VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
  if (!DemandedElts.isAllOnes())
    BVCost +=
        ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    reorderScalars(TE.Scalars, Mask);
    TE.ReorderIndices.clear();
  }
}
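
/// Post-processes the built graph: reorders gather nodes, splits oversized
/// gathers into vectorizable slices, converts reversed consecutive
/// loads/stores into strided accesses when cheaper, and marks combined
/// operations such as min/max selects.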
void BoUpSLP::transformNodes() {
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  BaseGraphSize = VectorizableTree.size();
  // Turn graph transforming mode on and off, when done.
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // Operands are profitable if they are:
  // 1. At least one constant or
  // 2. Splats or
  // 3. Results of vectorizable instructions.
  auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
                                           const InstructionsState &S) {
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    return all_of(
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          return all_of(Cand,
                        [](const std::pair<Value *, Value *> &P) {
                          return isa<Constant>(P.first) ||
                                 isa<Constant>(P.second) ||
                                 P.first == P.second;
                        }) ||
                 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
        });
  };

  // Try to reorder gather nodes for better vectorization opportunities.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather())
      reorderGatherNode(E);
  }

  // The tree may grow here, so iterate over nodes, built before.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      ArrayRef<Value *> VL = E.Scalars;
      const unsigned Sz = getVectorElementSize(VL.front());
      unsigned MinVF = getMinVF(2 * Sz);
      // Do not try partial vectorization for small nodes (<= 2), nodes with
      // the same opcode and same parent block or all constants.
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
            E.isAltShuffle() || !allSameBlock(VL)) ||
          allConstant(VL) || isSplat(VL))
        continue;
      // Try to find vectorizable sequences and transform them into a series
      // of insertvector instructions.
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      for (unsigned VF = getFloorFullVectorNumberOfElements(
               *TTI, VL.front()->getType(), VL.size() - 1);
           VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
                            *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
          continue;
        SmallVector<std::pair<unsigned, unsigned>> Slices;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          // If any instruction is vectorized already - do not try again.
          // Reuse the existing node, if it fully matches the slice.
          if (const TreeEntry *SE = getTreeEntry(Slice.front());
              SE || getTreeEntry(Slice.back())) {
            if (SE != getTreeEntry(Slice.back()))
              continue;
            if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
              continue;
          }
          // Constants are handled effectively already - skip.
          if (allConstant(Slice))
            continue;
          // Do not try to vectorize small splats (less than vector register
          // and only with the single non-undef element).
          bool IsSplat = isSplat(Slice);
          if (Slices.empty() || !IsSplat ||
              count(Slice, Slice.front()) ==
                  static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
                                                                   : 1)) {
            if (IsSplat)
              continue;
            InstructionsState S = getSameOpcode(Slice, *TLI);
            if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
                (S.getOpcode() == Instruction::Load &&
                 areKnownNonVectorizableLoads(Slice)) ||
                (S.getOpcode() != Instruction::Load &&
                 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
                                           VF)))
              continue;
            if (VF == 2) {
              // Try to vectorize reduced values or if all users are
              // vectorized.
              if ((!UserIgnoreList || E.Idx != 0) &&
                  !all_of(Slice, [&](Value *V) {
                    if (isa<PoisonValue>(V))
                      return true;
                    return areAllUsersVectorized(cast<Instruction>(V),
                                                 UserIgnoreList);
                  }))
                continue;
              if (S.getOpcode() == Instruction::Load) {
                // ...
              } else if (S.getOpcode() == Instruction::ExtractElement ||
                         (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                              TTI::TCC_Expensive &&
                          !CheckOperandsProfitability(
                              S.getMainOp(),
                              cast<Instruction>(*find_if(
                                  reverse(Slice), IsaPred<Instruction>)),
                              S))) {
                // Do not vectorize extractelements (handled effectively
                // already) or non-profitable instructions.
                continue;
              }
            }
          }
          Slices.emplace_back(Cnt, Slice.size());
        }
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
            End = Cnt;
        };
        for (auto [Cnt, Sz] : Slices) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
          // If any instruction is vectorized already - do not try again.
          if (TreeEntry *SE = getTreeEntry(Slice.front());
              SE || getTreeEntry(Slice.back())) {
            if (SE != getTreeEntry(Slice.back()))
              continue;
            if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
              continue;
            SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
            AddCombinedNode(SE->Idx, Cnt, Sz);
            continue;
          }
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
              analyzedReductionVals(Slice);
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            continue;
          }
          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
    }
    // Restore ordering, if no extra vectorization happened.
    if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
      SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
      reorderScalars(E.Scalars, Mask);
      E.ReorderIndices.clear();
    }
  }
  // Convert suitable nodes into strided/interleaved forms.
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (!E.hasState())
      continue;
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to handle masked gather entries.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as
      // strided load with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost)
          // Strided load is more profitable than consecutive load + reverse -
          // transform the node to strided load.
          E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // Check if profitable to represent consecutive store + reverse as
      // strided store with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // Strided store is more profitable than reverse + consecutive
          // store - transform the node to strided store.
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          auto *BaseSI = cast<StoreInst>(E.Scalars.front());
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            return 0u;
          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
            if (ShuffleVectorInst::isInterleaveMask(
                    Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
                TTI.isLegalInterleavedAccessType(
                    VecTy, Factor, BaseSI->getAlign(),
                    BaseSI->getPointerAddressSpace()))
              return Factor;
          }
          return 0u;
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
      }
      break;
    }
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
      if (MinMaxID == Intrinsic::not_intrinsic)
        break;
      // This node is a minmax node.
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
      if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }
  if (LoadEntriesToVectorize.empty()) {
    // Single load node - exit.
    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // Small graph with small VF - exit.
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;

    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
        getCanonicalGraphSize() <= SmallTree &&
        count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
                 [](const std::unique_ptr<TreeEntry> &TE) {
                   return TE->isGather() && TE->hasState() &&
                          TE->getOpcode() == Instruction::Load &&
                          !allSameBlock(TE->Scalars);
                 }) == 1)
      return;
  }

  // A list of loads to be gathered during the vectorization process. We can
  // try to vectorize them at the end, if profitable.
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
      GatheredLoads;

  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
                                  [&](Value *V) {
                                    return isa<LoadInst>(V) &&
                                           !isVectorized(V) &&
                                           !isDeleted(cast<Instruction>(V));
                                  }))) &&
        !isSplat(E.Scalars)) {
      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
          continue;
        gatherPossiblyVectorizableLoads(
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
                LI->getParent(),
                getUnderlyingObject(LI->getPointerOperand(),
                                    RecursionMaxDepth),
                LI->getType())]);
      }
    }
  }
  // Try to vectorize gathered loads if this is not just a gather of loads.
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
}
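
/// Cost-modeling counterpart of the shuffle instruction builder: it mirrors
/// the lazy shuffle emission logic of BaseShuffleAnalysis but accumulates TTI
/// costs instead of creating instructions.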
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  SmallVector<int> CommonMask;
  SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
  const TargetTransformInfo &TTI;
  InstructionCost Cost = 0;
  SmallDenseSet<Value *> VectorizedVals;
  BoUpSLP &R;
  SmallPtrSetImpl<Value *> &CheckedExtracts;
  constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  /// While set, still trying to estimate the cost for the same nodes and we
  /// can delay actual cost estimation (virtual shuffle instruction emission).
  bool SameNodesEstimated = true;

  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    // ...
    if (auto *VTy = dyn_cast<VectorType>(Ty))
      return ConstantVector::getSplat(
          VTy->getElementCount(), getAllOnesValue(DL, VTy->getElementType()));
    return Constant::getAllOnesValue(Ty);
  }

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    // ...
    auto *VecTy = getWidenedType(ScalarTy, VL.size());
    InstructionCost GatherCost = 0;
    SmallVector<Value *> Gathers(VL);
    if (!Root && isSplat(VL)) {
      // Found the broadcasting of the single scalar, calculate the cost as
      // the broadcast.
      const auto *It = find_if_not(VL, IsaPred<UndefValue>);
      assert(It != VL.end() && "Expected at least one non-undef value.");
      // Add broadcast for non-identity shuffle only.
      bool NeedShuffle =
          count(VL, *It) > 1 &&
          (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
      if (!NeedShuffle) {
        if (isa<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          return TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, {},
                                    CostKind, std::distance(VL.begin(), It),
                                    cast<FixedVectorType>(ScalarTy));
        }
        // ...
      }
      SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
      transform(VL, ShuffleMask.begin(), [](Value *V) {
        return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
      });
      return GatherCost + ::getShuffleCost(TTI, TTI::SK_Broadcast, VecTy,
                                           ShuffleMask, CostKind);
    }
    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)
                ? TTI::TCC_Free
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                  ScalarTy));
  };
  /// Compute the cost of creating a vector containing the extracted values
  /// from \p VL.
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    // ...
    unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
                                        SmallVectorImpl<unsigned> &Indices)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      // Check that if trying to permute same single/2 input vectors.
      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      SmallDenseSet<int, 4> RegIndices;
      for (auto [Pos, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            OffsetReg1 = alignDown(
                std::accumulate(
                    std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                    [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(S, I);
                    }),
                EltsPerVector);
            Indices.push_back(OffsetReg1 % NumElts);
          }
          Idx = I - OffsetReg1;
        }
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    InstructionCost Cost = 0;

    // Process extracts in blocks of EltsPerVector to check if the source
    // vector operand can be re-used directly. If not, add the cost of
    // creating a shuffle to extract the values into a vector register.
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      SmallVector<unsigned, 2> Indices;
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices);
      if (!RegShuffleKind) {
        if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
            !ShuffleVectorInst::isIdentityMask(
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, *ShuffleKinds[Part],
                               getWidenedType(ScalarTy, NumElts), MaskSlice);
        continue;
      }
      if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
          !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
        Cost += ::getShuffleCost(TTI, *RegShuffleKind,
                                 getWidenedType(ScalarTy, EltsPerVector),
                                 SubMask);
      }
      const unsigned BaseVF = getFullVectorNumberOfElements(
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (unsigned Idx : Indices) {
        assert((Idx + EltsPerVector) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
                                 getWidenedType(ScalarTy, BaseVF), {},
                                 CostKind, Idx,
                                 getWidenedType(ScalarTy, EltsPerVector));
      }
      // Second attempt to check, if just a permute is better estimated than
      // subvector extract.
      SubMask.assign(NumElts, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      InstructionCost OriginalCost =
          ::getShuffleCost(TTI, *ShuffleKinds[Part],
                           getWidenedType(ScalarTy, NumElts), SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }
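
  /// Accounts for the shuffles required to merge the two source tree entries
  /// into the common mask being built; estimation for matching nodes is
  /// delayed so repeated permutes of the same nodes are costed only once.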
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are reshuffling.
      // If we already requested the cost of reshuffling of E1 and E2 before,
      // no need to estimate another cost with the sub-Mask, instead include
      // this sub-Mask into the CommonMask to estimate it later and avoid
      // double cost estimation.
      if ((InVectors.size() == 2 &&
           cast<const TreeEntry *>(InVectors.front()) == &E1 &&
           cast<const TreeEntry *>(InVectors.back()) == E2) ||
          (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
               "Expected all poisoned elements.");
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the matched
      // and transform mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF,
                      cast<FixedVectorType>(V1->getType())->getNumElements());
      } else {
        const auto *E = cast<const TreeEntry *>(InVectors.front());
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = P.dyn_cast<Value *>()) {
        VF = std::max(VF, getVF(V1));
      } else {
        const auto *E = cast<const TreeEntry *>(P);
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }
  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      int Index = -1;
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
             (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
              Index == 0);
    }

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
    InstructionCost createPoison(Type *, unsigned) const {
      return TTI::TCC_Free;
    }
    void resizeToMatch(Value *&, Value *&) const {}
  };
  /// Adds 2 input vectors (in form of tree entries) and the mask for their
  /// shuffling.
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask);
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      // ...
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        EScalarTy =
            IntegerType::get(EScalarTy->getContext(), It->second.first);
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    getWidenedType(EScalarTy, VF),
                                    TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      if (isa<Constant>(V))
        return TTI::TCC_Free;
      auto *VecTy = cast<VectorType>(V->getType());
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            CastOpcode, getWidenedType(ScalarTy, VecTy->getElementCount()),
            VecTy, TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        SmallVector<int> E2Mask = E2->getCommonMask();
        if (!EMask.empty() || !E2Mask.empty()) {
          for (int &Idx : CommonMask) {
            if (Idx == PoisonMaskElem)
              continue;
            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
              Idx = EMask[Idx];
            else if (Idx >= static_cast<int>(CommonVF))
              Idx = (E2Mask.empty() ? Idx - CommonVF
                                    : E2Mask[Idx - CommonVF]) +
                    E->Scalars.size();
          }
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle single entry node.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
                                                               Factor)) {
        // Deinterleaved nodes are free.
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      // Check the reorder of the node - maybe the permute is free.
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 })) {
        // ...
      }
    } else if (V1 && P2.isNull()) {
      // Shuffle single vector.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle vector and tree node.
      unsigned VF = getVF(V1);
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // Shuffle vector and tree node.
      unsigned VF = getVF(V2);
      const TreeEntry *E1 = cast<const TreeEntry *>(P1);
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      if (V1->getType() != V2->getType()) {
        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      } else {
        if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
          V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
          V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      }
    }
    if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
      assert(SLPReVec && "FixedVectorType is not expected.");
      transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
                                             CommonMask);
    }
    InVectors.front() =
        Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder);
  }
public:
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    // Check if it can be considered reused if same extractelements were
    // vectorized already.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      for (auto [I, V] :
           enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
        // Ignore non-extractelement scalars.
        if (isa<UndefValue>(V) ||
            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
          continue;
        // If all users of instruction are going to be vectorized and this
        // instruction itself is not going to be vectorized, consider this
        // extractelement as removed and account its cost as a saving.
        auto *EE = cast<ExtractElementInst>(V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        const TreeEntry *VE = R.getTreeEntry(V);
        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
            any_of(EE->users(),
                   [&](User *U) {
                     return isa<GetElementPtrInst>(U) &&
                            !R.areAllUsersVectorized(cast<Instruction>(U),
                                                     &VectorizedVals);
                   }) ||
            (VE && VE != E))
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          Instruction *Ext = EE->user_back();
          if (isa<SExtInst, ZExtInst>(Ext) &&
              all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() to calculate the cost of
            // extractelement/ext pair.
            Cost -= TTI.getExtractWithExtendCost(
                Ext->getOpcode(), Ext->getType(),
                EE->getVectorOperandType(), Idx);
            // Add back the cost of s|zext which is subtracted separately.
            Cost += TTI.getCastInstrCost(
                Ext->getOpcode(), Ext->getType(), EE->getType(),
                TTI::getCastContextHint(Ext), CostKind, Ext);
            continue;
          }
        }
        Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
                                       CostKind, Idx);
      }
    }
    // Check that gather of extractelements can be represented as just a
    // shuffle of a single/two vectors the scalars are extracted from.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    InVectors.assign(1, E);
    CommonMask.assign(Mask.begin(), Mask.end());
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      VecBase =
          Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    }
    return VecBase;
  }
  /// Checks if the specified entry needs to be delayed because of its
  /// dependency nodes. No delay is needed during cost analysis.
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (&E1 == &E2) {
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
    if (NumParts == 0 || NumParts >= Mask.size() ||
        MaskVecTy->getNumElements() % NumParts != 0 ||
        !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
                                  MaskVecTy->getNumElements() / NumParts))
      NumParts = 1;
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
    if (NumParts == 0 || NumParts >= Mask.size() ||
        MaskVecTy->getNumElements() % NumParts != 0 ||
        !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
                                  MaskVecTy->getNumElements() / NumParts))
      NumParts = 1;
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    // May come only for shuffling of 2 vectors with extractelements, already
    // handled in adjustExtracts.
    assert(InVectors.size() == 1 &&
           all_of(enumerate(CommonMask),
                  [&](auto P) {
                    if (P.value() == PoisonMaskElem)
                      return Mask[P.index()] == PoisonMaskElem;
                    auto *EI = cast<ExtractElementInst>(
                        cast<const TreeEntry *>(InVectors.front())
                            ->getOrdered(P.index()));
                    return EI->getVectorOperand() == V1 ||
                           EI->getVectorOperand() == V2;
                  }) &&
           "Expected extractelement vectors.");
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled them in adjustExtracts.
      assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
             !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar = cast<const TreeEntry *>(InVectors[0])
                                          ->getOrdered(P.index());
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      if (isa<Constant>(V1))
                        return true;
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(
          VF,
          cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
              ->getNumElements());
    }
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // FIXME: Need to find a way to avoid use of getNullValue here.
      SmallVector<Constant *> Vals;
      unsigned VF = VL.size();
      if (MaskVF != 0)
        VF = std::min(VF, MaskVF);
      for (Value *V : VL.take_front(VF)) {
        if (isa<UndefValue>(V)) {
          Vals.push_back(cast<Constant>(V));
          continue;
        }
        Vals.push_back(Constant::getNullValue(V->getType()));
      }
      if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        // When REVEC is enabled, we need to expand vector types into scalar
        // types.
        unsigned VecTyNumElements = VecTy->getNumElements();
        SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
        for (auto [I, V] : enumerate(Vals)) {
          Type *ScalarTy = V->getType()->getScalarType();
          Constant *NewVal;
          if (isa<PoisonValue>(V))
            NewVal = PoisonValue::get(ScalarTy);
          else if (isa<UndefValue>(V))
            NewVal = UndefValue::get(ScalarTy);
          else
            NewVal = Constant::getNullValue(ScalarTy);
          std::fill_n(NewVals.begin() + I * VecTyNumElements,
                      VecTyNumElements, NewVal);
        }
        Vals.swap(NewVals);
      }
      return ConstantVector::get(Vals);
    }
    return ConstantVector::getSplat(
        ElementCount::getFixed(
            cast<FixedVectorType>(Root->getType())->getNumElements()),
        getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
  }
  InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles.
  InstructionCost
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = cast<Value *>(Vec);
      Action(V, CommonMask);
      InVectors.front() = V;
    }
    if (!SubVectors.empty()) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      // Add subvectors permutation cost.
      if (!SubVectorsMask.empty()) {
        assert(SubVectorsMask.size() <= CommonMask.size() &&
               "Expected same size of masks for subvectors and common mask.");
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                                 getWidenedType(ScalarTy, CommonMask.size()),
                                 SVMask, CostKind);
      }
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          EScalarTy =
              IntegerType::get(EScalarTy->getContext(), It->second.first);
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
              getWidenedType(EScalarTy, E->getVectorFactor()),
              TTI::CastContextHint::Normal, CostKind);
        }
        Cost += ::getShuffleCost(
            TTI, TTI::SK_InsertSubvector,
            getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
            getWidenedType(ScalarTy, E->getVectorFactor()));
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                    Idx);
        }
      }
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }

  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
    return VE;
  const auto *It =
      find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() &&
               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                 return EI.EdgeIdx == Idx && EI.UserTE == E;
               }) != TE->UserTreeIndices.end();
      });
  assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
  return It->get();
}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    SmallVector<int> Mask;
    inversePermutation(TE.ReorderIndices, Mask);
    if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
      return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}
11075 const unsigned VF,
unsigned MinBW,
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                      SmallPtrSetImpl<Value *> &CheckedExtracts) {
  ArrayRef<Value *> VL = E->Scalars;
  Type *ScalarTy = getValueType(VL[0]);
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // If we have computed a smaller type for the expression, update the
  // types so the costs are accurate.
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    if (VecTy)
      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
  }
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  unsigned EntryVF = E->getVectorFactor();
  auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);

  if (E->isGather()) {
    if (isa<InsertElementInst>(VL[0]))
      return InstructionCost::getInvalid();
    if (isa<CmpInst>(VL.front()))
      ScalarTy = VL.front()->getType();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  InstructionCost CommonCost = 0;
  SmallVector<int> Mask;
  if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
                                     !isReverseOrder(E->ReorderIndices))) {
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
    } else {
      inversePermutation(E->ReorderIndices, NewMask);
    }
    ::addMask(Mask, NewMask);
  }
  if (!E->ReuseShuffleIndices.empty())
    ::addMask(Mask, E->ReuseShuffleIndices);
  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
    CommonCost =
        ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize) &&
         "Unhandled state");
  assert(E->getOpcode() &&
         ((allSameType(VL) && allSameBlock(VL)) ||
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy())) &&
         "Invalid VL");
  Instruction *VL0 = E->getMainOp();
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (isa<Instruction>(UniqueValues[I]) && getTreeEntry(UniqueValues[I]) == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (const TreeEntry *OpTE = getTreeEntry(V))
      return getCastContextHint(*OpTE);
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
  auto GetCostDiff =
      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
          function_ref<InstructionCost(InstructionCost)> GetVectorCost) {
        // Calculate the cost of this instruction.
        InstructionCost ScalarCost = 0;
        if (isa<CastInst, CallInst>(VL0)) {
          // For some of the instructions no need to calculate cost for each
          // particular instruction, we can use the cost of the single
          // instruction x total number of scalar instructions.
          ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
        } else {
          for (unsigned I = 0; I < Sz; ++I) {
            if (UsedScalars.test(I))
              continue;
            ScalarCost += ScalarEltCost(I);
          }
        }

        InstructionCost VecCost = GetVectorCost(CommonCost);
        // Check if the current node must be resized, if the parent node is not
        // resized.
        if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
            (E->getOpcode() != Instruction::Load ||
             !E->UserTreeIndices.empty())) {
          const EdgeInfo &EI =
              *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
                return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
              });
          if (EI.UserTE->getOpcode() != Instruction::Select ||
              EI.EdgeIdx != 0) {
            auto UserBWIt = MinBWs.find(EI.UserTE);
            Type *UserScalarTy =
                EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
            if (UserBWIt != MinBWs.end())
              UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                              UserBWIt->second.first);
            if (ScalarTy != UserScalarTy) {
              unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
              unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
              unsigned VecOpcode;
              auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
              if (BWSz > SrcBWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy,
                                               TTI::CastContextHint::None,
                                               CostKind);
            }
          }
        }
        LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
                                 ScalarCost, "Calculated costs for Tree"));
        return VecCost - ScalarCost;
      };
  // Calculate cost difference from vectorizing set of GEPs.
  // Negative value means vectorizing is profitable.
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize) &&
           "Entry state expected to be Vectorize or StridedVectorize here.");
    InstructionCost ScalarCost = 0;
    InstructionCost VecCost = 0;
    std::tie(ScalarCost, VecCost) = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
  };
  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL0);
    Type *CanonicalType = Ty;
    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    InstructionCost IntrinsicCost =
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    // If the selects are the only uses of the compares, they will be dead
    // and we can adjust the cost by removing their cost.
    if (VI && SelectOnly) {
      assert(!Ty->isVectorTy() && "Expected only for scalar type.");
      auto *CI = cast<CmpInst>(VI->getOperand(0));
      IntrinsicCost -= TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
  };
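  // Each opcode case below supplies a GetScalarCost/GetVectorCost pair to
  // GetCostDiff. Illustrative numbers only: if four scalar adds cost 4 total
  // and one <4 x i32> add costs 1, GetCostDiff returns 1 - 4 = -3, i.e.
  // vectorizing this node saves roughly three units of cost.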
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Count reused scalars.
    InstructionCost ScalarCost = 0;
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I)
        Operands[I] = PHI->getIncomingValue(I);
      if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
        if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second &&
            !OpTE->ReuseShuffleIndices.empty())
          ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                          OpTE->Scalars.size());
    }
    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *I = cast<Instruction>(UniqueValues[Idx]);
      VectorType *SrcVecTy;
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        else
          NumElts = AggregateTy->getStructNumElements();
        SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
      }
      if (I->hasOneUse()) {
        Instruction *Ext = I->user_back();
        if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
            all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
          // Use getExtractWithExtendCost() to calculate the cost of
          // extractelement/ext pair.
          InstructionCost Cost = TTI->getExtractWithExtendCost(
              Ext->getOpcode(), Ext->getType(), SrcVecTy,
              *getExtractIndex(I));
          // Subtract the cost of s|zext which is subtracted separately.
          Cost -= TTI->getCastInstrCost(
              Ext->getOpcode(), Ext->getType(), I->getType(),
              TTI::getCastContextHint(Ext), CostKind, Ext);
          return Cost;
        }
      }
      return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
                                     CostKind, *getExtractIndex(I));
    };
    auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();

    unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    unsigned OffsetBeg = *getElementIndex(VL.front());
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(VL.drop_front())) {
      unsigned Idx = *getElementIndex(V);
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // Check if we can safely insert a subvector. If it is not possible, just
    // generate a whole-size vector and shuffle the source vector and the new
    // subvector.
    if (OffsetBeg + InsertVecSz > VecSz) {
      // Align OffsetBeg to generate correct mask.
      OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
      InsertVecSz = VecSz;
    }

    APInt DemandedElts = APInt::getZero(NumElts);
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
    } else {
      Mask.assign(VecSz, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");

    InstructionCost Cost = 0;
    Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
                                          /*Insert*/ true, /*Extract*/ false,
                                          CostKind);

    // First cost - resize to actual vector size if not identity shuffle or
    // need to shift the vector.
    auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
    if (!IsIdentity)
      Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                               InsertVecTy, Mask);
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    // Second cost - permutation with subvector, if some elements are from the
    // initial vector or inserting a subvector.
    SmallBitVector InMask =
        isUndefVector(FirstInsert->getOperand(0),
                      buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
        Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, {},
                                    CostKind, OffsetBeg - Offset, InsertVecTy);
      } else {
        // Leave only the inserted elements in the mask.
        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
          Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
        Cost +=
            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
      }
    }
    return Cost;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    Type *SrcScalarTy = VL0->getOperand(0)->getType();
    auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
        SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
      }
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                   VL0->getOperand(0)->getType(),
                                   TTI::getCastContextHint(VL0), CostKind, VL0);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count cost here if minimum bitwidth is in effect and it is just
      // a bitcast (here it is just a noop).
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));

      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(*UserIgnoreList, [](Value *V) {
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 Instruction::Xor},
                                I->getOpcode());
          });
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    CmpPredicate VecPred, SwappedVecPred;
    auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
    if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
        match(VL0, MatchCmp))
      SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
    else
      VecPred = SwappedVecPred = isa<FCmpInst>(VL0)
                                     ? CmpInst::BAD_FCMP_PREDICATE
                                     : CmpInst::BAD_ICMP_PREDICATE;
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
                                     ? CmpInst::BAD_FCMP_PREDICATE
                                     : CmpInst::BAD_ICMP_PREDICATE;
      auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
      if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
           !match(VI, MatchCmp)) ||
          (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
        CurrentPred = ScalarTy->isFloatingPointTy()
                          ? CmpInst::BAD_FCMP_PREDICATE
                          : CmpInst::BAD_ICMP_PREDICATE;
      InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
      InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
      if (IntrinsicCost.isValid())
        ScalarCost = IntrinsicCost;
      return ScalarCost;
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
      InstructionCost VecCost =
          TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
                                  CostKind, getOperandInfo(E->getOperand(0)),
                                  getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        auto *CondType =
            getWidenedType(SI->getCondition()->getType(), VL.size());
        unsigned CondNumElements = CondType->getNumElements();
        unsigned VecTyNumElements = getNumElements(VecTy);
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // When the return type is i1 but the source is a fixed vector type,
          // the condition value needs to be replicated.
          VecCost += ::getShuffleCost(
              *TTI, TTI::SK_PermuteSingleSrc, CondType,
              createReplicatedMask(VecTyNumElements / CondNumElements,
                                   CondNumElements));
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = GetMinMaxCost(VecTy);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
      TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
      TTI::OperandValueInfo Op2Info =
          TTI::getOperandInfo(VI->getOperand(OpIdx));
      return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
                                         Op1Info, Op2Info, {}, VI);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          ArrayRef<Value *> Ops = E->getOperand(I);
          if (all_of(Ops, [&](Value *Op) {
                auto *CI = dyn_cast<ConstantInt>(Op);
                return CI && CI->getValue().countr_one() >= It->second.first;
              }))
            return CommonCost;
        }
      }
      unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
      TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
      TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, {}, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, TTI::OperandValueInfo(), VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
        }
        break;
      case TreeEntry::StridedVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected vectorization state.");
      }
      return VecLdCost + CommonCost;
    };

    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // If this node generates masked gather load then it is not a terminal
    // node. Hence address operand cost is estimated separately.
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;

    // Estimate cost of GEPs since this tree node is a terminator.
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL))
      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, OpInfo, VI);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // We know that we can merge the stores. Calculate the cost.
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
                 "No reused shuffles expected");
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, std::nullopt,
              BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL)) {
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    }
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      if (ID != Intrinsic::not_intrinsic) {
        IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
        return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
      }
      return TTI->getCallInstrCost(CI->getCalledFunction(),
                                   CI->getFunctionType()->getReturnType(),
                                   CI->getFunctionType()->params(), CostKind);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::ShuffleVector: {
    if (!SLPReVec || E->isAltShuffle())
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");
    // Try to find the previous shuffle node with the same operands and same
    // main/alternate ops.
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE.get() == E)
          break;
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
      return TTI->getInstructionCost(VI, CostKind);
    };
    auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
      InstructionCost VecCost = 0;
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          E->dump();
        });
        // No need to add new vector costs here since we're going to reuse
        // the same main/alternate vector ops, just do different shuffling.
      } else if (Instruction::isBinaryOp(E->getOpcode())) {
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            VL0);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            E->getAltOp());
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
        if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
          auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz =
              DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
          if (SrcIt != MinBWs.end()) {
            SrcBWSz = SrcIt->second.first;
            SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
            SrcTy = getWidenedType(SrcSclTy, VL.size());
          }
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              VecCost =
                  TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                          TTI::CastContextHint::None, CostKind);
            LLVM_DEBUG({
              dbgs()
                  << "SLP: alternate extension, which should be truncated.\n";
              E->dump();
            });
            return VecCost;
          }
        }
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          TTI::CastContextHint::None, CostKind);
        VecCost +=
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                    TTI::CastContextHint::None, CostKind);
      }
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask);
      VecCost +=
          ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc, VecTy, Mask, CostKind);
      // Patterns like [fadd,fsub] can be combined into a single instruction
      // on x86. Reordering them into [fsub,fadd] blocks this pattern, so the
      // order has to be taken into account.
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
      // If this pattern is supported by the target then consider the order.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      return VecCost;
    };
    if (SLPReVec && !E->isAltShuffle())
      return GetCostDiff(
          GetScalarCost, [&](InstructionCost) -> InstructionCost {
            // If a group uses the mask in order, the shufflevector can be
            // eliminated by instcombine; then the cost is 0.
            assert(isa<ShuffleVectorInst>(VL.front()) &&
                   "Not supported shufflevector usage.");
            auto *SV = cast<ShuffleVectorInst>(VL.front());
            unsigned SVNumElements =
                cast<FixedVectorType>(SV->getOperand(0)->getType())
                    ->getNumElements();
            unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
            for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
              int NextIndex = 0;
              if (!all_of(VL.slice(I, GroupSize), [&](Value *V) {
                    assert(isa<ShuffleVectorInst>(V) &&
                           "Not supported shufflevector usage.");
                    auto *SV = cast<ShuffleVectorInst>(V);
                    int Index;
                    [[maybe_unused]] bool IsExtractSubvectorMask =
                        SV->isExtractSubvectorMask(Index);
                    assert(IsExtractSubvectorMask &&
                           "Not supported shufflevector usage.");
                    if (NextIndex != Index)
                      return false;
                    NextIndex += SV->getShuffleMask().size();
                    return true;
                  }))
                return ::getShuffleCost(
                    *TTI, TTI::SK_PermuteSingleSrc, VecTy,
                    calculateShufflevectorMask(E->Scalars));
            }
            return TTI::TCC_Free;
          });
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
    return CommonCost;
  default:
    llvm_unreachable("Unknown instruction");
  }
}
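
// A "tiny" tree has height 1 or 2. The check below accepts it only when every
// node is known to produce a real vector: vectorizable gathers, loads, or
// extractelements that fold into a shuffle.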
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable .\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));
  };

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather node if it has fewer scalar operands than the
  // initial tree element, or the scalars are extractelements that form a
  // shuffle.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize))
    return false;

  return true;
}
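
// Load combining turns a chain of or(shl(zext(load))) operations into one
// wide load; if the backend can do that, SLP-vectorizing the same pattern is
// a pessimization. Illustrative shape (assuming i8 loads combined to i32):
//   (zext(l0) << 0) | (zext(l1) << 8) | (zext(l2) << 16) | (zext(l3) << 24)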
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value. Arbitrarily follow the path
  // through operand 0 of any 'or'. Also, peek through optional
  // shift-left-by-multiple-of-8-bits.
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  // Check if the input is an extended load of the required or/shift expression.
  Value *Load;
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;

  // Require that the total load bit width is a legal integer type.
  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target, but
  // <16 x i8> --> i128 is not, so the backend probably can't reduce it.
  Type *SrcTy = Load->getType();
  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");

  return true;
}

bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
  if (RdxKind != RecurKind::Or)
    return false;
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /*MustMatchOrInst=*/false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  // Peek through a final sequence of stores and check if all operations are
  // likely to be load-combined.
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    Value *X;
    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
        !isLoadCombineCandidateImpl(X, NumElts, TTI, /*MustMatchOrInst=*/true))
      return false;
  }
  return true;
}
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
    return true;
  }

  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         allConstant(VectorizableTree[1]->Scalars))))
    return true;

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for the vectorization when the cost threshold is default: the
  // cost of vectorized PHI nodes is almost always 0 plus the cost of
  // gathers/buildvectors.
  constexpr int Limit = 4;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;

  // We can vectorize the tree if its size is greater than or equal to the
  // minimum size specified by the MinTreeSize command line option.
  if (VectorizableTree.size() >= MinTreeSize)
    return false;

  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;

  // Check if any of the gather nodes forms an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       allSameBlock(VectorizableTree.front()->Scalars));
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, UndefValue>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return false;

  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      allSameBlock(VectorizableTree.back()->Scalars) &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
          APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
          /*Insert=*/true, /*Extract=*/false, TTI::TCK_RecipThroughput) >
          -SLPCostThreshold)
    return true;

  // Otherwise, we can't vectorize the tree. It is both tiny and not fully
  // vectorizable.
  return true;
}

bool BoUpSLP::isTreeNotExtendable() const {
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      getCanonicalGraphSize() <= SmallTree &&
      count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return TE->isGather() && TE->hasState() &&
                        TE->getOpcode() == Instruction::Load &&
                        !allSameBlock(TE->Scalars);
               }) == 1)
    return true;
  bool Res = false;
  for (unsigned Idx : seq<unsigned>(VectorizableTree.size())) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (!E.isGather())
      continue;
    if (E.hasState() && E.getOpcode() != Instruction::Load)
      return false;
  }
  return Res;
}
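
// getSpillCost walks the vectorizable scalars in dominance order and counts
// calls between consecutive bundle roots; values live across those calls may
// need to be spilled, which TTI prices via getCostOfKeepingLiveOverCall.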
InstructionCost BoUpSLP::getSpillCost() const {
  // Walk from the bottom of the tree to the top, tracking which values are
  // live. When we see a call instruction that is not part of our tree, query
  // TTI to see if there is a cost to keeping values live over it (for
  // example, if spills and fills are required).
  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
  InstructionCost Cost = 0;

  SmallPtrSet<Instruction *, 4> LiveValues;
  Instruction *PrevInst = nullptr;

  // The entries in VectorizableTree are not necessarily ordered by their
  // position in basic blocks. Collect them and order them by dominance so
  // later instructions are guaranteed to be visited first.
  SmallVector<Instruction *, 16> OrderedScalars;
  for (const auto &TEPtr : VectorizableTree) {
    if (TEPtr->State != TreeEntry::Vectorize)
      continue;
    Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
    if (!Inst)
      continue;
    OrderedScalars.push_back(Inst);
  }
  llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
    auto *NodeA = DT->getNode(A->getParent());
    auto *NodeB = DT->getNode(B->getParent());
    assert(NodeA && "Should only process reachable instructions");
    assert(NodeB && "Should only process reachable instructions");
    assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    if (NodeA != NodeB)
      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
    return B->comesBefore(A);
  });

  for (Instruction *Inst : OrderedScalars) {
    if (!PrevInst) {
      PrevInst = Inst;
      continue;
    }

    // Update LiveValues.
    LiveValues.erase(PrevInst);
    for (auto &J : PrevInst->operands()) {
      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
        LiveValues.insert(cast<Instruction>(&*J));
    }

    LLVM_DEBUG({
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
      Inst->dump();
    });

    // Now find the sequence of instructions between PrevInst and Inst.
    unsigned NumCalls = 0;
    BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
                                 PrevInstIt =
                                     PrevInst->getIterator().getReverse();
    while (InstIt != PrevInstIt) {
      if (PrevInstIt == PrevInst->getParent()->rend()) {
        PrevInstIt = Inst->getParent()->rbegin();
        continue;
      }

      auto NoCallIntrinsic = [this](Instruction *I) {
        if (auto *II = dyn_cast<IntrinsicInst>(I)) {
          if (II->isAssumeLikeIntrinsic())
            return true;
          IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
          InstructionCost IntrCost =
              TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
          InstructionCost CallCost = TTI->getCallInstrCost(
              nullptr, II->getType(), ICA.getArgTypes(),
              TTI::TCK_RecipThroughput);
          if (IntrCost < CallCost)
            return true;
        }
        return false;
      };

      if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
          &*PrevInstIt != PrevInst)
        NumCalls++;

      ++PrevInstIt;
    }

    if (NumCalls) {
      SmallVector<Type *, 4> V;
      for (auto *II : LiveValues) {
        auto *ScalarTy = II->getType();
        if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
          ScalarTy = VectorTy->getElementType();
        V.push_back(getWidenedType(ScalarTy, BundleWidth));
      }
      Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
    }

    PrevInst = Inst;
  }

  return Cost;
}
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *VU, InsertElementInst *V,
    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Traverse both insertelement chains upwards and check whether they meet,
  // i.e. whether both inserts belong to the same buildvector sequence.
  std::optional<unsigned> Idx1 = getElementIndex(VU);
  std::optional<unsigned> Idx2 = getElementIndex(V);
  const auto *IE1 = VU;
  const auto *IE2 = V;
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  const InsertElementInst *PrevI1;
  const InsertElementInst *PrevI2;
  do {
    if (I2 == IE1)
      return true;
    if (I1 == IE2)
      return false;
    PrevI1 = I1;
    PrevI2 = I2;
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        getElementIndex(I1).value_or(*Idx2) != *Idx2)
      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        getElementIndex(I2).value_or(*Idx1) != *Idx1)
      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
  return false;
}
namespace {
/// Returns the incoming Value *, if the requested type is Value * too, or a
/// default-constructed value otherwise.
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
} // namespace
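
// performExtractsShuffleAction folds per-source shuffle masks into a chain of
// two-input shuffles. Illustrative mask algebra (assuming VF = 4): combining
// the first-vector mask {0,1,P,P} with a second-vector mask {P,P,0,1} appends
// the second source at offset VF, producing {0,1,4,5} (P = poison lane) --
// exactly the Mask[I] = SecMask[I] + Vec1VF step below.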
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
      else
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only
    // for the single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      // Identity mask is found.
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least 2 input vectors shuffled - perform 2 vector
    // shuffles step by step, combining shuffles between the steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = SecMask[I] + Vec1VF;
        }
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem) {
          Mask[I] = (Res1.second ? I : Mask[I]);
        } else if (SecMask[I] != PoisonMaskElem) {
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        }
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffle other input vectors, if any.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    ArrayRef<int> SecMask = VMIt->second;
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      } else if (Mask[I] != PoisonMaskElem) {
        Mask[I] = I;
      }
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
namespace {
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  MapVector<T, SmallVector<int>> ValueMasks;
};
} // namespace
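
// getTreeCost sums the per-entry costs, then adjusts for external uses:
// extracts for scalars used outside the tree, shuffles that replace
// insertelement sequences, spill costs around calls, and a possible final
// resize when the tree was narrowed to a smaller bit width.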
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
  InstructionCost Cost = 0;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");

  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();

  SmallPtrSet<Value *, 4> CheckedExtracts;
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // No need to count the cost for combined entries, they are combined and
    // just skip their cost.
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      continue;
    }
    if (TE.isGather() && TE.hasState()) {
      if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
          E && E->getVectorFactor() == TE.getVectorFactor() &&
          E->isSame(TE.Scalars)) {
        // Some gather nodes might be absolutely the same as some vectorizable
        // nodes after reordering; need to handle it.
        LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle.\n"
                          << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }

    assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
           "Expected gather nodes with users only.");

    InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
    Cost += C;
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle.\n"
                      << "SLP: Current total cost = " << Cost << "\n");
  }
  SmallPtrSet<Value *, 16> ExtractCostCalculated;
  InstructionCost ExtractCost = 0;
  SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
  SmallVector<APInt> DemandedElts;
  SmallDenseSet<Value *, 4> UsedInserts;
  DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
  SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  // Keep track of {Scalar, Index, User} tuples: on some targets (e.g.
  // AArch64) this helps fuse the mov associated with an extractelement into a
  // user instruction, making the extract free.
  SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  }
  for (ExternalUser &EU : ExternalUses) {
    // Uses by ephemeral values are free (the ephemeral value will be removed
    // prior to code generation, and so will the extraction).
    if (EphValues.count(EU.User))
      continue;

    // Used in unreachable blocks or in EH pads, or the user block is
    // terminated with unreachable.
    if (BasicBlock *UserParent =
            EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
        UserParent &&
        (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
         isa_and_present<UnreachableInst>(UserParent->getTerminator())))
      continue;

    // We only add extract cost once for the same scalar.
    if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;

    // No extract cost for a vector "scalar".
    if (isa<FixedVectorType>(EU.Scalar->getType()))
      continue;

    // If the found user is an insertelement, do not calculate extract cost
    // but try to detect it as a final shuffled/identity match.
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
        VU && VU->getOperand(1) == EU.Scalar) {
      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
        if (!UsedInserts.insert(VU).second)
          continue;
        std::optional<unsigned> InsertIdx = getElementIndex(VU);
        if (InsertIdx) {
          const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
          auto *It = find_if(
              ShuffledInserts,
              [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
                // Checks if 2 insertelements are from the same buildvector.
                InsertElementInst *VecInsert = Data.InsertElements.front();
                return areTwoInsertFromSameBuildVector(
                    VU, VecInsert, [this](InsertElementInst *II) -> Value * {
                      Value *Op0 = II->getOperand(0);
                      if (getTreeEntry(II) && !getTreeEntry(Op0))
                        return nullptr;
                      return Op0;
                    });
              });
          int VecId = -1;
          if (It == ShuffledInserts.end()) {
            auto &Data = ShuffledInserts.emplace_back();
            Data.InsertElements.emplace_back(VU);
            DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
            VecId = ShuffledInserts.size() - 1;
            auto It = MinBWs.find(ScalarTE);
            if (It != MinBWs.end() &&
                VectorCasts
                    .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                    .second) {
              unsigned BWSz = It->second.first;
              unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
              unsigned VecOpcode;
              if (DstBWSz < BWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              InstructionCost C = TTI->getCastInstrCost(
                  VecOpcode, FTy,
                  getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
                                 FTy->getNumElements()),
                  TTI::CastContextHint::None, CostKind);
              LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                                << " for extending externally used vector with "
                                   "non-equal minimum bitwidth.\n");
              Cost += C;
            }
          } else {
            if (isFirstInsertElement(VU, It->InsertElements.front()))
              It->InsertElements.front() = VU;
            VecId = std::distance(ShuffledInserts.begin(), It);
          }
          int InIdx = *InsertIdx;
          SmallVectorImpl<int> &Mask =
              ShuffledInserts[VecId].ValueMasks[ScalarTE];
          if (Mask.empty())
            Mask.assign(FTy->getNumElements(), PoisonMaskElem);
          Mask[InIdx] = EU.Lane;
          DemandedElts[VecId].setBit(InIdx);
          continue;
        }
      }
    }

    // If we plan to rewrite the tree in a smaller type, we will need to
    // extend the extracted value back to the original type. Account for the
    // extract and the added cost of the extend if needed.
    InstructionCost ExtraCost = TTI::TCC_Free;
    auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
    const TreeEntry *Entry = getTreeEntry(EU.Scalar);
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
      unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
                            ? Instruction::ZExt
                            : Instruction::SExt;
      VecTy = getWidenedType(MinTy, BundleWidth);
      ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
                                                VecTy, EU.Lane);
    } else {
      ExtraCost =
          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
                                  EU.Lane, EU.Scalar, ScalarUserAndIdx);
    }
    // Leave the scalar instructions as is if they are cheaper than extracts.
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // Checks if the user of the external scalar is a phi in a loop body.
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
          auto *I = cast<Instruction>(U.Scalar);
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
        }
        return false;
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for_each(enumerate(ExternalUses), [&](const auto &P) {
          // Ignore phis in loops.
          if (IsPhiInLoop(P.value()))
            return;
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        });
      }
      // Can use the original instruction, if no operands are vectorized or
      // they are marked as externally used already.
      auto *Inst = cast<Instruction>(EU.Scalar);
      InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
      auto OperandIsScalar = [&](Value *V) {
        if (!getTreeEntry(V)) {
          // Some extractelements might be not vectorized, but transformed
          // into a shuffle and removed from the function; consider it here.
          if (auto *EE = dyn_cast<ExtractElementInst>(V))
            return !EE->hasOneUse() || !MustGather.contains(EE);
          return true;
        }
        return ValueToExtUses->contains(V);
      };
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
            Op && all_of(Op->operands(), OperandIsScalar)) {
          InstructionCost OpCost =
              (getTreeEntry(Op) && !ValueToExtUses->contains(Op))
                  ? TTI->getInstructionCost(Op, CostKind)
                  : 0;
          if (ScalarCost + OpCost <= ExtraCost) {
            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
            ScalarCost += OpCost;
          }
        }
      }
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        // Try to keep the original scalar if the user is a phi node from the
        // same block as the root phis being vectorized: it preserves better
        // ordering info for the PHIs.
        bool IsProfitablePHIUser =
            (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
                            VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
            !Inst->hasNUsesOrMore(UsesLimit) &&
            none_of(Inst->users(),
                    [&](User *U) {
                      auto *PHIUser = dyn_cast<PHINode>(U);
                      return (!PHIUser ||
                              PHIUser->getParent() !=
                                  cast<Instruction>(
                                      VectorizableTree.front()->getMainOp())
                                      ->getParent()) &&
                             !getTreeEntry(U);
                    }) &&
            count_if(Entry->Scalars, [&](Value *V) {
              return ValueToExtUses->contains(V);
            }) <= 2;
        if (IsProfitablePHIUser) {
          KeepScalar = true;
        } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
                   ExtraCost - ScalarCost <= TTI::TCC_Basic &&
                   (!GatheredLoadsEntriesFirst.has_value() ||
                    Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          }
          // Keep the original scalar if the number of externally used
          // instructions in the same entry is not a power of 2; it may help
          // to do some extra vectorization later.
          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
        }
        if (KeepScalar) {
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for_each(Inst->operands(), [&](Value *V) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              // Replace all uses to avoid compiler crash.
              ExternalUses[It->second].User = nullptr;
            }
          });
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            // Update the users of the operands of the cast operand to avoid
            // compiler crash.
            if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
              for_each(IOp->operands(), [&](Value *V) {
                auto It = ValueToExtUses->find(V);
                if (It != ValueToExtUses->end()) {
                  // Replace all uses to avoid compiler crash.
                  ExternalUses[It->second].User = nullptr;
                }
              });
            }
          }
        }
      }
    }

    ExtractCost += ExtraCost;
  }
  // Insert externally used values into the corresponding tree entries, mark
  // their operands as used scalars.
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    if (const TreeEntry *E = getTreeEntry(V)) {
      ExternalUses.emplace_back(V, nullptr, E->findLaneForValue(V));
    }
  }
  // Add the reduced value cost, if resized.
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
        if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
          assert(SLPReVec && "Only supported by REVEC.");
          SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
        }
        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
                                      TTI::CastContextHint::None,
                                      TTI::TCK_RecipThroughput);
      }
    }
  }

  InstructionCost SpillCost = getSpillCost();
  Cost += SpillCost + ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    InstructionCost C = 0;
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    if (VF != VecVF &&
        (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
         !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
      SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
      std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                OrigMask.begin());
      C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
                           getWidenedType(TE->getMainOp()->getType(), VecVF),
                           OrigMask);
      LLVM_DEBUG(
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement external users.\n";
          TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      Cost += C;
      return std::make_pair(TE, true);
    }
    return std::make_pair(TE, false);
  };
  // Calculate the cost of the reshuffled vectors, if any.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    unsigned VF = 0;
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        if (VF == 0)
          VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
            !all_of(enumerate(Mask), [=](const auto &Data) {
              return Data.value() == PoisonMaskElem ||
                     (Data.index() < VF &&
                      static_cast<int>(Data.index()) == Data.value());
            })) {
          InstructionCost C =
              ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                            << " for final shuffle of insertelement "
                               "external users.\n";
                     TEs.front()->dump();
                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
          Cost += C;
        }
      } else {
        if (VF == 0) {
          if (TEs.front() &&
              TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
            VF = TEs.front()->getVectorFactor();
          else
            VF = Mask.size();
        }
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        InstructionCost C =
            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                          << " for final shuffle of vector node and external "
                             "insertelement users.\n";
                   if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
        Cost += C;
      }
      VF = Mask.size();
      return TEs.back();
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
        DemandedElts[I], /*Insert*/ true, /*Extract*/ false, CostKind);
    Cost -= InsertCost;
  }
  // Add the cost for the reduced value resize (if required).
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            all_of(*UserIgnoreList, [](Value *V) {
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   Instruction::Xor},
                                  I->getOpcode());
            });
        if (IsArithmeticExtendedReduction)
          Opcode = Instruction::BitCast; // Handled as extended integer
                                         // reduction.
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        TTI::CastContextHint CCH = TTI::CastContextHint::None;
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        default:
          break;
        }
        InstructionCost CastCost =
            TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH, CostKind);
        Cost += CastCost;
        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
                          << " for final resize for reduction from " << SrcVecTy
                          << " to " << DstVecTy << "\n";
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
      }
    }
  }

#ifndef NDEBUG
  SmallString<256> Str;
  {
    raw_svector_ostream OS(Str);
    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
       << "SLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
  }
  LLVM_DEBUG(dbgs() << Str);
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif

  return Cost;
}
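
// The gather path below tries to rewrite a gather of extractelements as one
// or two vector shuffles of the source vectors instead of rebuilding the
// vector element by element.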
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // Scan the list of gathered scalars for extractelements that can be
  // represented as shuffles.
  MapVector<Value *, SmallVector<int>> VectorOpToIdx;
  SmallVector<int> UndefVectorExtracts;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI) {
      if (isa<UndefValue>(VL[I]))
        UndefVectorExtracts.push_back(I);
      continue;
    }
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
      continue;
    std::optional<unsigned> Idx = getExtractIndex(EI);
    // Undefined index.
    if (!Idx || *Idx >= VecTy->getNumElements()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    SmallBitVector ExtractMask(VecTy->getNumElements(), true);
    ExtractMask.reset(*Idx);
    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
      UndefVectorExtracts.push_back(I);
      continue;
    }
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // Sort the vector operands by the maximum number of uses in
  // extractelements.
  SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
      VectorOpToIdx.takeVector();
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // Find the best pair of vectors, or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if it is better to perform a shuffle of 2 vectors or just of a
  // single vector.
  SmallVector<Value *> SavedVL(VL.begin(), VL.end());
  SmallVector<Value *> GatheredExtracts(
      VL.size(), PoisonValue::get(VL.front()->getType()));
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      std::swap(GatheredExtracts[Idx], VL[Idx]);
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
  }
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
    std::swap(GatheredExtracts[Idx], VL[Idx]);
  // Check that the gather of extractelements can be represented as just a
  // shuffle of one or two vectors the scalars are extracted from.
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask, AC);
  if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
    // TODO: try to check other subsets if possible.
    // Restore the original VL if the attempt was not successful.
    copy(SavedVL, VL.begin());
    return std::nullopt;
  }
  // Restore unused scalars from the mask, if some of the extractelements
  // were not selected for shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
        isa<UndefValue>(GatheredExtracts[I])) {
      std::swap(VL[I], GatheredExtracts[I]);
      continue;
    }
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
        is_contained(UndefVectorExtracts, I))
      continue;
  }
  return Res;
}
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  Mask.assign(VL.size(), PoisonMaskElem);
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // Scan the list of gathered scalars for extractelements that can be
    // represented as shuffles.
    MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
        Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVector<int> SubMask;
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
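
// isGatherShuffledSingleRegisterEntry checks whether the gathered scalars of
// one register-sized part are already produced by at most two existing tree
// entries; if so, the gather becomes a single- or two-source permute whose
// mask is written into the corresponding slice of Mask.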
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
    bool ForOrder) {
  Entries.clear();
  const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
                                ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
                                : TE->UserTreeIndices.front();
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  const BasicBlock *TEInsertBlock = nullptr;
  // The main node of PHI entries keeps the correct order of operands/incoming
  // blocks.
  if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    TEInsertPt = TEInsertBlock->getTerminator();
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (!DT->isReachableFromEntry(TEInsertBlock))
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // Returns true if the insertion point of the vector code for TE dominates
    // InsertPt, making it correct to extract vector elements from the entry
    // at that point instead of rebuilding them.
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    if (!NodeEUI)
      return false;
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // Check the order of the gather node users.
    if (TEInsertPt->getParent() != InsertBlock &&
        !DT->dominates(NodeUI, NodeEUI))
      return false;
    if (TEInsertPt->getParent() == InsertBlock &&
        !TEInsertPt->comesBefore(InsertPt))
      return false;
    return true;
  };
  // Find all tree entries used by the gathered values. Here we build a set of
  // tree nodes for each gathered value and find the intersection between
  // these sets. If we have at least one common tree node for each gathered
  // value, we have just a permutation of a single vector; two different sets
  // mean a permutation of two input vectors.
  SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
  DenseMap<Value *, int> UsedValuesEntry;
  for (Value *V : VL) {
    if (isConstant(V))
      continue;
    // Build a list of tree entries where V is used.
    SmallPtrSet<const TreeEntry *, 4> VToTEs;
    for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
      if (TEPtr == TE || TEPtr->Idx == 0)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndices.size() == 1 &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();

      PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
      const Instruction *InsertPt =
          UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // If 2 gathers are operands of the same entry, compare the operand
        // indices and use the earlier one as the base.
        if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
          continue;
        // If the user instruction is used for some reason in different
        // vectorized nodes - make it depend on the index.
        if (TEUseEI.UserTE != UseEI.UserTE &&
            TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
          continue;
      }

      // Check if the user node of TE comes after the user node of TEPtr,
      // otherwise TEPtr depends on TE.
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
        continue;
      VToTEs.insert(TEPtr);
    }
    if (const TreeEntry *VTE = getTreeEntry(V)) {
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
        if (VTE->State != TreeEntry::Vectorize) {
          auto It = MultiNodeScalars.find(V);
          if (It == MultiNodeScalars.end())
            continue;
          VTE = *It->getSecond().begin();
          // Iterate through all vectorized nodes.
          auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
            return MTE->State == TreeEntry::Vectorize;
          });
          if (MIt == It->getSecond().end())
            continue;
          VTE = *MIt;
        }
      }
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
          continue;
      }
      VToTEs.insert(VTE);
    }
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // The first iteration, just insert the list of nodes to the vector.
      UsedTEs.push_back(VToTEs);
      UsedValuesEntry.try_emplace(V, 0);
    } else {
      // Need to check if there are any previously used tree nodes which use
      // V; if there are none, we have another input vector.
      SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
      unsigned Idx = UINT_MAX;
      for (auto [I, Set] : enumerate(UsedTEs)) {
        // Is there a non-empty intersection of the previously listed tree
        // entries and the tree entries using the current V?
        set_intersect(VToTEs, Set);
        if (!VToTEs.empty()) {
          // Yes, write the new subset and continue with the next scalar.
          Set.swap(VToTEs);
          Idx = I;
          break;
        }
        VToTEs = SavedVToTEs;
      }
      // No non-empty intersection found - need to add a second set of
      // possible source vectors.
      if (Idx == UINT_MAX) {
        // More than 2 input vectors is not a permutation, fall back to the
        // regular gather.
        if (UsedTEs.size() == 2)
          continue;
        UsedTEs.push_back(SavedVToTEs);
        Idx = UsedTEs.size() - 1;
      }
      UsedValuesEntry.try_emplace(V, Idx);
    }
  }

  if (UsedTEs.empty()) {
    Entries.clear();
    return std::nullopt;
  }
  unsigned VF = 0;
  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // Try to find a perfect match in another gather node first.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        SmallVector<int> CommonMask = TE->getCommonMask();
        copy(CommonMask, Mask.begin());
      }
      // Clear undef scalars.
      for (unsigned I : seq<unsigned>(VL.size()))
        if (isa<PoisonValue>(VL[I]))
          Mask[Part * VL.size() + I] = PoisonMaskElem;
      return TargetTransformInfo::SK_PermuteSingleSrc;
    }
    // No perfect match, just shuffle, so choose the first tree node from the
    // tree.
    Entries.push_back(FirstEntries.front());
    VF = FirstEntries.front()->getVectorFactor();
  } else {
    // Try to find nodes with the same vector factor.
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    // Keep the order of tree nodes to avoid non-determinism.
    DenseMap<int, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      VFToTE.try_emplace(VF, TE);
    }
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        VF = It->first;
        Entries.push_back(It->second);
        Entries.push_back(TE);
        break;
      }
    }
    // No 2 source vectors with the same vector factor - just choose 2 with
    // the max index.
    if (Entries.empty()) {
      Entries.push_back(*llvm::max_element(
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          }));
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    } else {
      VF = Entries.front()->getVectorFactor();
    }
  }

  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  // Checks if the 2 PHIs are compatible in terms of a high possibility of
  // being vectorized together.
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // The incoming values are compatible if they all are constants, or are
    // instructions with the same/alternate opcodes from the same basic block.
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In = PHI->getIncomingValue(I);
      Value *In1 = PHI1->getIncomingValue(I);
      if (isConstant(In) && isConstant(In1))
        continue;
      if (!getSameOpcode({In, In1}, *TLI))
        return false;
      if (cast<Instruction>(In)->getParent() !=
          cast<Instruction>(In1)->getParent())
        return false;
    }
    return true;
  };
  // Check if the value can be ignored during analysis for shuffled gathers.
  // It is better to ignore instructions that do not form splats, are not
  // vectorized/not extractelements (these will be handled by extractelements
  // processing), or may form a vector node in the future.
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
           !isVectorLikeInstWithConstOps(I) &&
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // Check that the neighbor instruction may form a full vector node with the
  // current instruction V. It is possible if they have the same/alternate
  // opcode and the same parent basic block.
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           getSameOpcode({V, V1}, *TLI) &&
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  };
  // Build a shuffle mask for better cost estimation and vector emission.
  SmallBitVector UsedIdxs(Entries.size());
  SmallVector<std::pair<unsigned, int>> EntryLanes;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // Do not try to shuffle scalars if they are constants, or instructions
    // that can be vectorized as a result of the following vector build
    // vectorization.
    if (isConstant(V) || (MightBeIgnored(V) &&
                          ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
                           (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
      continue;
    unsigned Idx = It->second;
    EntryLanes.emplace_back(Idx, I);
    UsedIdxs.set(Idx);
  }
  // Iterate through all shuffled scalars and select entries which can be used
  // for the final shuffle.
  SmallVector<const TreeEntry *> TempEntries;
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // Fix the entry number for the given scalar. If it is the first entry,
    // set Pair.first to 0, otherwise to 1 (at max 2 nodes are selected).
    // These indices are used as the vector offset when building the final
    // shuffle mask.
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    TempEntries.push_back(Entries[I]);
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      !VL.equals(ArrayRef(TE->Scalars)
                     .slice(Part * VL.size(),
                            std::min<int>(VL.size(), TE->Scalars.size())))) {
    // If the number of scalars equals the number of entries, the analysis is
    // not very profitable; since VL differs from TE->Scalars, there are
    // already shuffles before this point. Cut off the non-profitable case.
    Entries.clear();
    return std::nullopt;
  }
  // Build the final mask, check for the identity shuffle, if possible.
  bool IsIdentity = Entries.size() == 1;
  // Pair.first is the offset to the vector, while Pair.second is the index of
  // the scalar in the list.
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    Mask[Idx] =
        Pair.first * VF +
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
    case 1:
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteSingleSrc;
      break;
    case 2:
      if (EntryLanes.size() > 2 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteTwoSrc;
      break;
    default:
      break;
    }
  } else if (!isa<VectorType>(VL.front()->getType()) &&
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // Do the cost estimation to see if the shuffle beats the buildvector.
    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                             std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
      if (Idx == PoisonMaskElem)
        continue;
      if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
        MinElement = Idx;
      if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
        MaxElement = Idx;
    }
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        VL.size(), getFullVectorNumberOfElements(
                       *TTI, VL.front()->getType(),
                       (MaxElement % VF) - (MinElement % VF) + 1));
    if (NewVF < VF) {
      for_each(SubMask, [&](int &Idx) {
        if (Idx == PoisonMaskElem)
          return;
        Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
              (Idx >= static_cast<int>(VF) ? NewVF : 0);
      });
      VF = NewVF;
    }

    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    auto *VecTy = getWidenedType(VL.front()->getType(), VF);
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&,
                           &TTI = *TTI](ArrayRef<int> Mask,
                                        ArrayRef<const TreeEntry *> Entries,
                                        VectorType *VecTy) -> InstructionCost {
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
          ShuffleVectorInst::isDeInterleaveMaskOfFactor(
              Mask, Entries.front()->getInterleaveFactor()))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI,
                              Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
                                                 : TTI::SK_PermuteSingleSrc,
                              VecTy, Mask, CostKind);
    };
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    APInt DemandedElts = APInt::getAllOnes(SubMask.size());
    InstructionCost FirstShuffleCost = 0;
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the first entry.
      SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(FirstMask)) {
        if (Idx >= static_cast<int>(NewVF)) {
          Idx = PoisonMaskElem;
        } else {
          DemandedElts.clearBit(I);
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      if (!IsIdentity)
        FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
      FirstShuffleCost += TTI->getScalarizationOverhead(
          MaskVecTy, DemandedElts, /*Insert=*/true,
          /*Extract=*/false, CostKind);
    }
    InstructionCost SecondShuffleCost = 0;
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
    } else {
      // Transform the mask to include only the second entry.
      SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(SecondMask)) {
        if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
          Idx = PoisonMaskElem;
        } else {
          DemandedElts.clearBit(I);
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      if (!IsIdentity)
        SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
      SecondShuffleCost += TTI->getScalarizationOverhead(
          MaskVecTy, DemandedElts, /*Insert=*/true,
          /*Extract=*/false, CostKind);
    }
    InstructionCost BuildVectorCost = TTI->getScalarizationOverhead(
        MaskVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx >= static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                    });
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    }
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx < static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                      else
                        Idx -= VF;
                    });
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    }
    if (BuildVectorCost >= ShuffleCost) {
      if (BestEntry) {
        Entries.clear();
        Entries.push_back(BestEntry);
      }
      return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
                                : TargetTransformInfo::SK_PermuteSingleSrc;
    }
  }
  Entries.clear();
  // Clear the corresponding mask elements.
  std::fill(std::next(Mask.begin(), Part * VL.size()),
            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
  return std::nullopt;
}
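
// isGatherShuffledEntry splits the gather into NumParts register-sized
// slices and runs the single-register analysis per slice, with a fast path
// when one entry covers the whole vector factor.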
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
    unsigned NumParts, bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(ArrayRef(VectorizableTree).drop_front(),
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // FIXME: Gathering for non-power-of-2 (non whole register) nodes is not
  // implemented yet.
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
    return {};
  Mask.assign(VL.size(), PoisonMaskElem);
  assert((TE->UserTreeIndices.size() == 1 ||
          TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  if (!TE->UserTreeIndices.empty() &&
      TE->UserTreeIndices.front().UserTE->isGather() &&
      TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
    assert(
        (TE->Idx == 0 ||
         (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
         isSplat(TE->Scalars)) &&
        "Expected splat or extractelements only node.");
    return {};
  }
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  for (unsigned Part : seq<unsigned>(NumParts)) {
    ArrayRef<Value *> SubVL =
        VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(SubRes);
    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
      return Res;
    }
  }
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
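
// getGatherCost prices a buildvector: insertelement costs for unique
// non-constant scalars plus, when duplicates exist, one final permute that
// replicates the repeated lanes.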
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  bool DuplicateNonConst = false;
  // Find the cost of inserting/extracting values from the vector. Check if
  // the same elements are inserted several times and count them as shuffle
  // candidates.
  APInt ShuffledElements = APInt::getZero(VL.size());
  DenseMap<Value *, unsigned> UniqueElements;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy) {
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
                                    TTI::CastContextHint::None, CostKind);
      V = nullptr;
    }
    if (!ForPoisonSrc)
      Cost +=
          TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                  I, Constant::getNullValue(VecTy), V);
  };
  SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    // No need to shuffle duplicates for constants.
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
      ShuffledElements.setBit(I);
      ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
      continue;
    }
    auto Res = UniqueElements.try_emplace(V, I);
    if (Res.second) {
      EstimateInsertCost(I, V);
      ShuffleMask[I] = I;
      continue;
    }
    DuplicateNonConst = true;
    ShuffledElements.setBit(I);
    ShuffleMask[I] = Res.first->second;
  }
  if (ForPoisonSrc) {
    if (isa<FixedVectorType>(ScalarTy)) {
      assert(SLPReVec && "Only supported by REVEC.");
      // We don't need to insert elements one by one. Instead, we can insert
      // the entire vector into the destination.
      Cost = 0;
      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
      for (unsigned I : seq<unsigned>(VL.size()))
        if (!ShuffledElements[I])
          Cost += TTI->getShuffleCost(
              TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
              I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
    } else {
      Cost = TTI->getScalarizationOverhead(VecTy,
                                           /*DemandedElts*/ ~ShuffledElements,
                                           /*Insert*/ true,
                                           /*Extract*/ false, CostKind, VL);
    }
  }
  if (DuplicateNonConst)
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                             VecTy, ShuffleMask);
  return Cost;
}
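
// getLastInstructionInBundle picks the insertion point for a bundle: the
// last (or, for gathered loads and unscheduled entries, the first) scalar in
// dominance order, falling back to schedule data when the block was
// scheduled.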
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
  if (Res)
    return *Res;
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions
  // with constant indices or gathered loads).
  auto *Front = E->getMainOp();
  auto *BB = Front->getParent();
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           E->getOpcode() == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (E->getOpcode() == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(LastInst) &&
               isVectorLikeInstWithConstOps(I)) ||
              (GatheredLoadsEntriesFirst.has_value() &&
               E->getOpcode() == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    BB = LastInst->getParent();
    return LastInst;
  };

  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (isVectorLikeInstWithConstOps(FirstInst) &&
               isVectorLikeInstWithConstOps(I))) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    return FirstInst;
  };

  // Set the insert point for gathered loads to the very first load.
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      E->getOpcode() == Instruction::Load) {
    Res = FindFirstInst();
    return *Res;
  }

  // Set the insert point to the beginning of the basic block if the entry
  // should not be scheduled.
  if ((E->getOpcode() == Instruction::GetElementPtr &&
       any_of(E->Scalars,
              [](Value *V) {
                return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
              })) ||
      all_of(E->Scalars,
             [](Value *V) {
               return isa<PoisonValue>(V) ||
                      (!isVectorLikeInstWithConstOps(V) &&
                       isUsedOutsideBlock(V));
             }) ||
      (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
         return isa<ExtractElementInst, UndefValue>(V) ||
                areAllOperandsNonInsts(V);
       })))
    Res = FindLastInst();
  else
    Res = FindFirstInst();
  if (Res)
    return *Res;

  // Find the last instruction. The common case should be that BB has been
  // scheduled, and the last instruction is VL.back(). So we start with
  // VL.back() and iterate over schedule data until we reach the end of the
  // bundle. The end of the bundle is marked by a null ScheduleData.
  if (BlocksSchedules.count(BB) && !E->isGather()) {
    Value *V = E->isOneOf(E->Scalars.back());
    if (doesNotNeedToBeScheduled(V))
      V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
    if (Bundle && Bundle->isPartOfBundle())
      for (; Bundle; Bundle = Bundle->NextInBundle)
        Res = Bundle->Inst;
  }

  // Res can still be null here if there is no entry for BB in
  // BlocksSchedules or no ScheduleData is available for VL.back(), e.g. when
  // buildTree_rec aborted early (maximum recursion depth or region size
  // reached). ScheduleData is initialized in the scheduling "dry-run".
  if (!Res)
    Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  return *Res;
}
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is a PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle. Set
    // the debug location to Front.
    Builder.SetInsertPoint(
        LastInst->getParent(),
        LastInst->getNextNonDebugInstruction()->getIterator());
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
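
// gather() emits the buildvector sequence: constants first, then
// non-constants, postponing loop-resident scalars so loop-invariant inserts
// can still be hoisted; each insert of a vectorized scalar is recorded as an
// external use.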
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           getTreeEntry(Inst) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(VL[I], I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      // If the scalar is a sign/zero-extension of a deleted or vectorized
      // instruction, cast its source operand instead.
      if (auto *CI = dyn_cast<CastInst>(Scalar);
          isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
        Value *Op = CI->getOperand(0);
        if (auto *IOp = dyn_cast<Instruction>(Op);
            !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
          Scalar = Op;
      }
      Scalar = Builder.CreateIntCast(
          Scalar, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
    }

    Instruction *InsElt;
    if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
      // Insert a whole subvector.
      Vec = createInsertVector(Builder, Vec, Scalar,
                               Pos * VecTy->getNumElements());
      auto *II = dyn_cast<IntrinsicInst>(Vec);
      if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
        return Vec;
      InsElt = II;
    } else {
      Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
      InsElt = dyn_cast<InsertElementInst>(Vec);
      if (!InsElt)
        return Vec;
    }
    GatherShuffleExtractSeq.insert(InsElt);
    CSEBlocks.insert(InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(V)) {
      if (TreeEntry *Entry = getTreeEntry(V)) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (Scalar != V) {
          if (auto *SI = dyn_cast<Instruction>(Scalar))
            UserOp = SI;
        } else {
          UserOp = InsElt;
        }
        if (UserOp) {
          unsigned FoundLane = Entry->findLaneForValue(V);
          ExternalUses.emplace_back(V, UserOp, FoundLane);
        }
      }
    }
    return Vec;
  };

  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  Value *Vec = PoisonValue::get(VecTy);
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size());
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && isa<PoisonValue>(SV->getOperand(1)) &&
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // Insert constant values first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(I))
      continue;
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    if (isa<PoisonValue>(VL[I]))
      continue;
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
    Mask[I] = I + E;
  }
  if (Root) {
    if (isa<PoisonValue>(Vec)) {
      Vec = OriginalRoot;
    } else {
      Vec = CreateShuffle(Root, Vec, Mask);
      if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
          OI && OI->hasNUses(0) &&
          none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->VectorizedValue == OI;
          }))
        eraseInstruction(OI);
    }
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append the postponed (loop/tree) instructions last.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
  return Vec;
}
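/// The class below combines already emitted vectors with shuffle masks and
/// emits the (near-)minimal shuffle sequence for them. Its inner
/// ShuffleIRBuilder is the IR-emitting callback used by
/// BaseShuffleAnalysis::createShuffle: it creates the actual shufflevector
/// instructions, registers them for later CSE, and int-casts mismatched
/// integer vector operands to a common element type.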
class ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  /// Combined mask for all applied operands and masks.
  SmallVector<int> CommonMask;
  /// List of operands for the shuffle.
  SmallVector<Value *, 2> InVectors;
  IRBuilderBase &Builder;
  BoUpSLP &R;

  class ShuffleIRBuilder {
    IRBuilderBase &Builder;
    /// Holds all of the instructions that we gathered.
    SetVector<Instruction *> &GatherShuffleExtractSeq;
    /// A list of blocks that we are going to CSE.
    DenseSet<BasicBlock *> &CSEBlocks;
    /// Data layout.
    const DataLayout &DL;

  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates a shufflevector for the 2 operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (cast<VectorType>(V2->getType())
                ->getElementType()
                ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                             ->getElementType()
                                             ->getIntegerBitWidth())
          V2 = Builder.CreateIntCast(
              V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
        else
          V1 = Builder.CreateIntCast(
              V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates a permutation of the single vector operand with the given
    /// mask, if it is not an identity mask.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      unsigned VF = Mask.size();
      unsigned LocalVF =
          cast<FixedVectorType>(V1->getType())->getNumElements();
      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
        return V1;
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(getWidenedType(Ty, VF));
    }
    /// Resizes the 2 input vectors to match, if their sizes differ: the
    /// smaller vector is widened to the size of the larger one.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }
  };

  /// Smart shuffle instruction emission, walks through the shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder);
  }
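  // Shuffle mask semantics used throughout this builder (standard LLVM IR
  // shufflevector rules): an index below the first operand's width selects
  // from the first vector, an index at or above it selects from the second,
  // and PoisonMaskElem (-1) produces poison in that lane. For example,
  // shuffling <a, b> and <c, d> with the mask <0, 2, -1, 3> yields
  // <a, c, poison, d>.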
  /// Cast value \p V to a vector with the same number of elements but with
  /// the base type ScalarTy.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V,
        VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
        IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
  /// Adjusts extractelements after reusing them.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      int Idx = Mask[I];
      if (Idx == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(VL[I]);
      VecBase = EI->getVectorOperand();
      if (const TreeEntry *TE = R.getTreeEntry(VecBase))
        VecBase = TE->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // The extractelement can be erased only if it has a single vectorized
      // use and is not required externally or by another node.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            const TreeEntry *UTE = R.getTreeEntry(U);
            return !UTE || R.MultiNodeScalars.contains(U) ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return any_of(TE->UserTreeIndices,
                                            [&](const EdgeInfo &Edge) {
                                              return Edge.UserTE == UTE;
                                            }) &&
                                     is_contained(VL, EI);
                            }) != 1;
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
      for (auto [I, Idx] : enumerate(Mask))
        if (Idx != PoisonMaskElem)
          Idx = I;
    };
    // Perform a multi-register vector shuffle of all source vectors into a
    // single register.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
      MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (const TreeEntry *TE = R.getTreeEntry(VecOp))
              VecOp = TE->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        if (I == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (const TreeEntry *TE = R.getTreeEntry(VecOp))
          VecOp = TE->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert((Part == 0 ||
                all_of(seq<unsigned>(Part),
                       [&](unsigned P) {
                         ArrayRef<int> SubMask =
                             Mask.slice(P * SliceSize,
                                        getNumElems(Mask.size(), SliceSize, P));
                         return all_of(SubMask, [](int Idx) {
                           return Idx == PoisonMaskElem;
                         });
                       })) &&
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        unsigned NewVF =
            cast<FixedVectorType>(Vec->getType())->getNumElements();
        if (Vec->getType() != SubVec->getType()) {
          unsigned SubVecVF =
              cast<FixedVectorType>(SubVec->getType())->getNumElements();
          NewVF = std::max(NewVF, SubVecVF);
        }
        // Adjust SubMask.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += NewVF;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    copy(VecMask, Mask.begin());
    return Vec;
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all deps are ready.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission; a stub value is returned and replaced after
    // the end of the process to keep the correct order.
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        ResVecTy,
        PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
        MaybeAlign());
  }
  /// Adds 2 input vectors (in form of tree entries) and the mask for their
  /// shuffling.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
             if (isa<PoisonValue>(V))
               return false;
             return !isKnownNonNegative(V, SimplifyQuery(*R.DL));
           }));
    Value *V2 = E2.VectorizedValue;
    if (V2->getType()->isIntOrIntVectorTy())
      V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
             if (isa<PoisonValue>(V))
               return false;
             return !isKnownNonNegative(V, SimplifyQuery(*R.DL));
           }));
    add(V1, V2, Mask);
  }
  /// Adds a single input vector (in form of a tree entry) and the mask for
  /// its shuffling.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
             if (isa<PoisonValue>(V))
               return false;
             return !isKnownNonNegative(V, SimplifyQuery(*R.DL));
           }));
    add(V1, Mask);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds another input vector and the mask for its shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool = false) {
    assert(isa<FixedVectorType>(V1->getType()) &&
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        unsigned VF = std::max(CommonMask.size(), Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] =
                V->getType() != V1->getType()
                    ? Idx + VF
                    : Mask[Idx] + cast<FixedVectorType>(V1->getType())
                                      ->getNumElements();
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // Check if the second vector is required if the used elements are
      // already used from the first one.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
          InVectors.push_back(V1);
          break;
        }
    }
    unsigned VF = 0;
    for (Value *V : InVectors)
      VF = std::max(VF, getVF(V));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
  Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
  /// Finalize emission of the shuffles.
  /// \param Action the action (if any) to be performed before the final
  /// applying of the \p ExtMask mask.
  Value *
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    SmallVector<int> NewExtMask(ExtMask);
    if (ScalarTyNumElements != 1) {
      assert(SLPReVec && "FixedVectorType is not expected.");
      transformScalarShuffleIndiciesToVector(ScalarTyNumElements, CommonMask);
      transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewExtMask);
      ExtMask = NewExtMask;
    }
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask);
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = E->VectorizedValue;
          if (V->getType()->isIntOrIntVectorTy())
            V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
                  if (isa<PoisonValue>(V))
                    return false;
                  return !isKnownNonNegative(V, SimplifyQuery(*R.DL));
                }));
          unsigned InsertionIndex = Idx * ScalarTyNumElements;
          Vec = createInsertVector(
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1,
                        _2, _3));
          if (!CommonMask.empty()) {
            std::iota(std::next(CommonMask.begin(), InsertionIndex),
                      std::next(CommonMask.begin(),
                                (Idx + E->getVectorFactor()) *
                                    ScalarTyNumElements),
                      InsertionIndex);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem)
            I1 = I2 + CommonMask.size();
        }
        Value *InsertVec =
            CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
        Vec = createShuffle(InsertVec, Vec, SVMask);
        transformMaskAfterShuffle(CommonMask, SVMask);
      }
      InVectors.front() = Vec;
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
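/// Looks for an already-vectorized tree entry that can serve as operand
/// number \p NodeIdx of \p E: either a node over the same scalars that lists
/// (E, NodeIdx) among its user edges, or a matching multi-node entry.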
BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
                                                         unsigned NodeIdx) {
  ArrayRef<Value *> VL = E->getOperand(NodeIdx);
  InstructionsState S = getSameOpcode(VL, *TLI);
  // Special processing for GEPs bundle, which may include non-gep values.
  if (!S && VL.front()->getType()->isPointerTy()) {
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    if (It != VL.end())
      S = getSameOpcode(*It, *TLI);
  }
  if (!S)
    return nullptr;
  auto CheckSameVE = [&](const TreeEntry *VE) {
    return VE->isSame(VL) &&
           (any_of(VE->UserTreeIndices,
                   [E, NodeIdx](const EdgeInfo &EI) {
                     return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
                   }) ||
            any_of(VectorizableTree,
                   [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
                     return TE->isOperandGatherNode(
                                {const_cast<TreeEntry *>(E), NodeIdx}) &&
                            VE->isSame(TE->Scalars);
                   }));
  };
  TreeEntry *VE = getTreeEntry(S.getMainOp());
  if (VE && CheckSameVE(VE))
    return VE;
  auto It = MultiNodeScalars.find(S.getMainOp());
  if (It != MultiNodeScalars.end()) {
    auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
      return TE != VE && CheckSameVE(TE);
    });
    if (I != It->getSecond().end())
      return *I;
  }
  return nullptr;
}
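/// Vectorizes operand \p NodeIdx of node \p E. A matching vectorized entry
/// is reused when one exists (reshuffled if its vector factor differs from
/// the requested one); otherwise the corresponding operand gather node is
/// vectorized.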
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                 bool PostponedPHIs) {
  ValueList &VL = E->getOperand(NodeIdx);
  const unsigned VF = VL.size();
  if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
    auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
      // V may be affected by MinBWs, so take its element type.
      Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
      ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
      ShuffleBuilder.add(V, Mask);
      SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
          E->CombinedEntriesWithIndices.size());
      transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
                [&](const auto &P) {
                  return std::make_pair(VectorizableTree[P.first].get(),
                                        P.second);
                });
      assert((E->CombinedEntriesWithIndices.empty() ||
              E->ReorderIndices.empty()) &&
             "Expected either combined subnodes or reordering");
      return ShuffleBuilder.finalize({}, SubVectors, {});
    };
    Value *V = vectorizeTree(VE, PostponedPHIs);
    if (VF * getNumElements(VL[0]->getType()) !=
        cast<FixedVectorType>(V->getType())->getNumElements()) {
      if (!VE->ReuseShuffleIndices.empty()) {
        // Reshuffle to get only unique values.
        SmallVector<int> Mask(VF, PoisonMaskElem);
        for (auto [I, V] : enumerate(VL)) {
          if (isa<PoisonValue>(V))
            continue;
          Mask[I] = VE->findLaneForValue(V);
        }
        V = FinalShuffle(V, Mask);
      } else {
        assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
               "Expected vectorization factor less "
               "than original vector size.");
        SmallVector<int> UniformMask(VF, 0);
        std::iota(UniformMask.begin(), UniformMask.end(), 0);
        V = FinalShuffle(V, UniformMask);
      }
    }
    // Need to update the operand gather node, if the operand is actually not
    // a vectorized node but a buildvector/gather node matching one of the
    // vectorized nodes.
    if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
          return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
        }) == VE->UserTreeIndices.end()) {
      auto *It =
          find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
                   TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
          });
      assert(It != VectorizableTree.end() && "Expected gather node operand.");
      (*It)->VectorizedValue = V;
    }
    return V;
  }

  // Find the corresponding gather entry and vectorize it.
  auto *I = find_if(VectorizableTree,
                    [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
                      return TE->isOperandGatherNode({E, NodeIdx});
                    });
  assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
  assert(I->get()->UserTreeIndices.size() == 1 &&
         "Expected only single user for the gather node.");
  assert(I->get()->isSame(VL) && "Expected same list of scalars.");
  return vectorizeTree(I->get(), PostponedPHIs);
}
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
                                       E->ReuseShuffleIndices.end());
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Clear values, to be replaced by insertvector instructions.
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    for_each(MutableArrayRef(GatheredScalars)
                 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
             [&](Value *&V) { V = PoisonValue::get(V->getType()); });
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  // Build a mask out of the reorder indices and reorder scalars per this
  // mask.
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  SmallVector<int> SubVectorsMask;
  inversePermutation(E->ReorderIndices, SubVectorsMask);
  // Transform non-clustered elements in the mask to poison (-1).
  // "Clustered" operations will be reordered using this mask later.
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (unsigned I : seq<unsigned>(GatheredScalars.size()))
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
  } else {
    SubVectorsMask.clear();
  }
  SmallVector<Value *> StoredGS(GatheredScalars);
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
    unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      auto *It = find_if(VectorizableTree,
                         [=](const std::unique_ptr<TreeEntry> &TE) {
                           return find_if(TE->UserTreeIndices,
                                          [=](const EdgeInfo &EI) {
                                            return EI.UserTE == UserTE &&
                                                   EI.EdgeIdx != EdgeIdx;
                                          }) != TE->UserTreeIndices.end();
                         });
      if (It == VectorizableTree.end())
        return false;
      SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
      if (!(*It)->ReorderIndices.empty()) {
        inversePermutation((*It)->ReorderIndices, ReorderMask);
        reorderScalars(GS, ReorderMask);
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
                   (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
                    is_contained(E->Scalars, V1));
          }))
        return false;
    }
    int Idx;
    if ((Mask.size() < InputVF &&
         ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
         Idx == 0) ||
        (Mask.size() == InputVF &&
         ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
      std::iota(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          0);
    } else {
      unsigned IVal =
          *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
      std::fill(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          IVal);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= GatheredScalars.size())
    NumParts = 1;
  if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
    // Check for gathered extracts.
    bool Resized = false;
    ExtractShuffles =
        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(ExtractMask)) {
        if (I == PoisonMaskElem)
          continue;
        if (const auto *TE = getTreeEntry(
                cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand()))
          ExtractEntries.push_back(TE);
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission; it will happen after the end of the process to
        // keep the correct order.
        PostponedGathers.insert(E);
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            Resized = true;
            GatheredScalars.append(VF - GatheredScalars.size(),
                                   PoisonValue::get(OrigScalarTy));
          }
      }
    }
    // Check for shuffled gathers only after full gather matches were ruled
    // out.
    if (!ExtractShuffles.empty() || !E->hasState() ||
        E->getOpcode() != Instruction::Load ||
        (((E->hasState() && E->getOpcode() == Instruction::Load) ||
          any_of(E->Scalars, IsaPred<LoadInst>)) &&
         any_of(E->Scalars,
                [this](Value *V) {
                  return isa<LoadInst>(V) && getTreeEntry(V);
                })) ||
        (E->hasState() && E->isAltShuffle()) ||
        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
        isSplat(E->Scalars) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffles =
          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
    }
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission; it will happen after the end of the process to
        // keep the correct order.
        PostponedGathers.insert(E);
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
          Entries.front().front()->isSame(E->Scalars)) {
        // Perfect match in the graph, will reuse the previously vectorized
        // node. Cost is 0.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          << ".\n");
        // Restore the mask for previous partially matched values.
        Mask.resize(E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
        } else {
          for (auto [I, V] : enumerate(E->Scalars)) {
            if (isa<PoisonValue>(V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        ShuffleBuilder.add(*FrontTE, Mask);
        // Full matched entry found, no need to insert subvectors.
        Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
        return Res;
      }
      if (!Resized) {
        if (GatheredScalars.size() != VF &&
            any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
              return any_of(TEs, [&](const TreeEntry *TE) {
                return TE->getVectorFactor() == VF;
              });
            }))
          GatheredScalars.append(VF - GatheredScalars.size(),
                                 PoisonValue::get(OrigScalarTy));
      }
      // Remove shuffled elements from the list of gathers.
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        if (Mask[I] != PoisonMaskElem)
          GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
      }
    }
  }
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
    Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-const values and all constant values; repeated
    // values are just shuffled.
    int NumNonConsts = 0;
    int SinglePos = 0;
    for (auto [I, V] : enumerate(Scalars)) {
      if (isa<UndefValue>(V)) {
        if (!isa<PoisonValue>(V)) {
          ReuseMask[I] = I;
          UndefPos.push_back(I);
        }
        continue;
      }
      if (isConstant(V)) {
        ReuseMask[I] = I;
        continue;
      }
      ++NumNonConsts;
      SinglePos = I;
      Value *OrigV = V;
      Scalars[I] = PoisonValue::get(OrigScalarTy);
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore single insert element.
      if (IsSplat) {
        ReuseMask.assign(VF, PoisonMaskElem);
        std::swap(Scalars.front(), Scalars[SinglePos]);
        if (!UndefPos.empty() && UndefPos.front() == 0)
          Scalars.front() = UndefValue::get(OrigScalarTy);
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with the simple broadcast.
      // We can do it if the broadcasted value is guaranteed to be
      // non-poisonous, or by freezing the incoming scalar value first.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (getTreeEntry(V) || isGuaranteedNotToBePoison(V, AC) ||
                (E->UserTreeIndices.size() == 1 &&
                 any_of(V->uses(), [E](const Use &U) {
                   // Check if the value is already used in the same
                   // operation in one of the nodes.
                   return E->UserTreeIndices.front().EdgeIdx !=
                              U.getOperandNo() &&
                          is_contained(
                              E->UserTreeIndices.front().UserTE->Scalars,
                              U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit a broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // Set the undef position to the non-poisoned scalar.
          ReuseMask[I] = Pos;
          if (I != Pos)
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
      } else {
        // Replace undefs by poisons, emit a broadcast and then a freeze.
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          if (isa<UndefValue>(Scalars[I]))
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // A gather of extractelements can be represented as a shuffle of a
      // single vector or of two vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
          ExtractMask[I] = PoisonMaskElem;
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          if (isa<UndefValue>(E->Scalars[I]))
            continue;
          auto *EI = cast<ExtractElementInst>(StoredGS[I]);
          Value *VecOp = EI->getVectorOperand();
          if (const auto *TE = getTreeEntry(VecOp))
            if (TE->VectorizedValue)
              VecOp = TE->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
                         isGuaranteedNotToBePoison(Vec2, AC);
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
                           /*ForExtracts=*/true);
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
        auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
        VecMask.assign(VecMask.size(), PoisonMaskElem);
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
                isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
        }
      }
    }
    // Try to figure out the best way to combine values: build a shuffle and
    // insert elements, or just build several shuffles.
    // Insert non-constant scalars.
    SmallVector<Value *> NonConstants(GatheredScalars);
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if currently
    // we have a single permutation and more than 1 scalar constants.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(V) && !isa<PoisonValue>(V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(V) && !isa<UndefValue>(V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(V) && !isa<PoisonValue>(V);
         }) > 1);
    // NonConstants contains just the non-constant values, GatheredScalars
    // only the constants used to build the final vector and then shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(OrigScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
    // Generate constants for the final shuffle and build a mask for them.
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask,
          E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
            TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
            Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
          });
  } else if (!allConstant(GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
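/// IR-emitting instantiation of processBuildVector: vectorizes the combined
/// sub-entries first, then builds the gather sequence with
/// ShuffleInstructionBuilder.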
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  bool PostponedPHIs) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder,
                                                                *this);
}
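/// \returns \p Inst after propagating metadata from the instructions in
/// \p VL (non-instruction values are skipped).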
static Instruction *propagateMetadata(Instruction *Inst,
                                      ArrayRef<Value *> VL) {
  SmallVector<Value *> Insts;
  for (Value *V : VL)
    if (isa<Instruction>(V))
      Insts.push_back(V);
  return llvm::propagateMetadata(Inst, Insts);
}
Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
  IRBuilderBase::InsertPointGuard Guard(Builder);

  if (E->VectorizedValue &&
      (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
       E->isAltShuffle())) {
    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  if (!isa<CmpInst>(V))
    ScalarTy = getValueType(V);
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    if (VecTy)
      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
  }
  auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
  if (E->isGather()) {
    // Set insert point for non-reduction initial nodes.
    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
    E->VectorizedValue = Vec;
    return Vec;
  }

  bool IsReverseOrder =
      !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() ||
            !E->UserTreeIndices.empty()) &&
           "PHI reordering is free.");
    if (PostponedPHIs && E->VectorizedValue)
      return E->VectorizedValue;
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    Builder.SetCurrentDebugLocation(PH->getDebugLoc());
    if (PostponedPHIs || !E->VectorizedValue) {
      PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
      E->PHI = NewPhi;
      Value *V = NewPhi;

      // Adjust insertion point once all PHIs have been generated.
      Builder.SetInsertPoint(PH->getParent(),
                             PH->getParent()->getFirstInsertionPt());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());

      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      if (PostponedPHIs)
        return V;
    }
    PHINode *NewPhi = cast<PHINode>(E->PHI);

    // PHINodes may have multiple entries from the same block. We want to
    // visit every block once.
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;

    for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
      BasicBlock *IBB = PH->getIncomingBlock(I);

      // Stop emission if all incoming values are generated.
      if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return NewPhi;
      }

      if (!VisitedBBs.insert(IBB).second) {
        NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
        continue;
      }

      Builder.SetInsertPoint(IBB->getTerminator());
      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
      Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      NewPhi->addIncoming(Vec, IBB);
    }

    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
           "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    if (const TreeEntry *TE = getTreeEntry(V))
      V = TE->VectorizedValue;
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    Value *NewV = propagateMetadata(V, E->Scalars);
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
    Value *V = vectorizeOperand(E, 1, PostponedPHIs);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
      assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
          V,
          getWidenedType(
              ScalarTy,
              cast<FixedVectorType>(V->getType())->getNumElements()),
          Res.second);
    }

    // Create the insertvector shuffle, if necessary.
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();

    unsigned Offset = *getElementIndex(VL0);
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create a shuffle to resize the vector.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
      Mask.append(NumElts - NumScalars, PoisonMaskElem);
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      Value *Scalar = E->Scalars[PrevMask[I]];
      unsigned InsertIdx = *getElementIndex(Scalar);
      IsIdentity &= InsertIdx - Offset == I;
      Mask[InsertIdx - Offset] = I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous =
          !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // Follow all insertelement instructions from the current buildvector
        // sequence.
        InsertElementInst *Ins = cast<InsertElementInst>(VL0);
        do {
          std::optional<unsigned> InsertIdx = getElementIndex(Ins);
          if (!InsertIdx)
            break;
          if (InsertMask[*InsertIdx] == PoisonMaskElem)
            InsertMask[*InsertIdx] = *InsertIdx;
          if (!Ins->hasOneUse())
            break;
          Ins = dyn_cast_or_null<InsertElementInst>(
              Ins->getUniqueUndroppableUser());
        } while (Ins);
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        SmallBitVector IsFirstUndef =
            isUndefVector(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          unsigned Idx = 0;
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              if (!V2)
                V2 = UndefValue::get(V->getType());
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              ++Idx;
            } else if (InsertMask[I] != PoisonMaskElem &&
                       Mask[I] == PoisonMaskElem) {
              InsertMask[I] = PoisonMaskElem;
            }
          }
        } else {
          InsertMask = Mask;
        }
      }
      if (!V2)
        V2 = PoisonValue::get(V->getType());
      V = Builder.CreateShuffleVector(V, V2, InsertMask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    for (unsigned I = 0; I < NumElts; I++) {
      if (Mask[I] != PoisonMaskElem)
        InsertMask[Offset + I] = I;
    }
    SmallBitVector UseMask =
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
    SmallBitVector IsFirstUndef =
        isUndefVector(FirstInsert->getOperand(0), UseMask);
    if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
        NumElts != NumScalars) {
      if (IsFirstUndef.all()) {
        if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
          SmallBitVector IsFirstPoison =
              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
          if (!IsFirstPoison.all()) {
            for (unsigned I = 0; I < NumElts; I++) {
              if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
                InsertMask[I] = I + NumElts;
            }
          }
          V = Builder.CreateShuffleVector(
              V,
              IsFirstPoison.all() ? PoisonValue::get(V->getType())
                                  : FirstInsert->getOperand(0),
              InsertMask, cast<Instruction>(E->Scalars.back())->getName());
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }
      } else {
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        for (unsigned I = 0; I < NumElts; I++) {
          if (InsertMask[I] == PoisonMaskElem)
            InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
          else
            InsertMask[I] += NumElts;
        }
        V = Builder.CreateShuffleVector(
            FirstInsert->getOperand(0), V, InsertMask,
            cast<Instruction>(E->Scalars.back())->getName());
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
    }

    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
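  // For the cast opcodes below, the emitted vector opcode may differ from the
  // scalar one when operands were demoted via MinBWs: equal bit widths turn
  // the cast into a no-op bitcast, a narrower target becomes a trunc, and a
  // wider one becomes a sext or zext chosen by the recorded signedness.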
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    auto *CI = cast<CastInst>(VL0);
    Instruction::CastOps VecOpcode = CI->getOpcode();
    Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *R = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (L->getType() != R->getType()) {
      assert((getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }

    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    Value *V = Builder.CreateCmp(P0, L, R);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
      ICmp->setSameSign(/*B=*/false);
    // Do not cast for cmps.
    VecTy = cast<FixedVectorType>(V->getType());
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *True = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *False = vectorizeOperand(E, 2, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
              getOperandEntry(E, 2)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }

    unsigned CondNumElements = getNumElements(Cond->getType());
    unsigned TrueNumElements = getNumElements(True->getType());
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
    assert(TrueNumElements == getNumElements(False->getType()) &&
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      // When the condition is narrower than the selected values, replicate
      // each condition lane.
      Cond = Builder.CreateShuffleVector(
          Cond, createReplicatedMask(TrueNumElements / CondNumElements,
                                     CondNumElements));
    }
    assert(getNumElements(Cond->getType()) == TrueNumElements &&
           "Cannot vectorize Instruction::Select");
    Value *V = Builder.CreateSelect(Cond, True, False);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);

    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    Value *V = Builder.CreateUnOp(
        static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *I = dyn_cast<Instruction>(V))
      V = propagateMetadata(I, E->Scalars);

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);

    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    if (Op->getType() != VecTy) {
      assert((It != MinBWs.end() ||
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
        ArrayRef<Value *> Ops = E->getOperand(I);
        if (all_of(Ops, [&](Value *Op) {
              auto *CI = dyn_cast<ConstantInt>(Op);
              return CI && CI->getValue().countr_one() >= It->second.first;
            })) {
          // The 'and' only keeps the demoted bits; it can be dropped.
          Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
          E->VectorizedValue = V;
          ++NumVectorInstructions;
          return V;
        }
      }
    }
    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (LHS->getType() != VecTy)
        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      if (RHS->getType() != VecTy)
        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }

    Value *V = Builder.CreateBinOp(
        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
    propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = propagateMetadata(I, E->Scalars);
      // Drop nuw flags for commutative subtractions.
      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          any_of(E->Scalars, [](Value *V) {
            return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
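  // Loads come in three flavors below: consecutive (one wide load), strided
  // (llvm.experimental.vp.strided.load with a constant or runtime stride,
  // negated for reversed order), and scattered (a masked gather over a
  // vectorized pointer operand).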
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::StridedVectorize) {
      Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
      Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
      PO = IsReverseOrder ? PtrN : Ptr0;
      std::optional<int> Diff = getPointersDiff(
          VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
      Type *StrideTy = DL->getIndexType(PO->getType());
      Value *StrideVal;
      if (Diff) {
        int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
        StrideVal =
            ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                           DL->getTypeAllocSize(ScalarTy));
      } else {
        SmallVector<Value *> PointerOps(E->Scalars.size());
        transform(E->Scalars, PointerOps.begin(), [](Value *V) {
          return cast<LoadInst>(V)->getPointerOperand();
        });
        OrdersType Order;
        std::optional<Value *> Stride =
            calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
                              &*Builder.GetInsertPoint());
        Value *NewStride =
            Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
        StrideVal = Builder.CreateMul(
            NewStride,
            ConstantInt::get(
                StrideTy,
                (IsReverseOrder ? -1 : 1) *
                    static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
      }
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {VecTy, PO->getType(), StrideTy},
          {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /*ArgNo=*/0,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      if (isa<FixedVectorType>(ScalarTy)) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        // CreateMaskedGather expects VecTy and VecPtr to have the same size,
        // so expand VecPtr if ScalarTy is itself a vector type.
        unsigned ScalarTyNumElements =
            cast<FixedVectorType>(ScalarTy)->getNumElements();
        unsigned VecTyNumElements =
            cast<FixedVectorType>(VecTy)->getNumElements();
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
        SmallVector<Constant *> Indices(VecTyNumElements);
        transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
          return Builder.getInt64(I % ScalarTyNumElements);
        });
        VecPtr = Builder.CreateGEP(
            VecTy->getElementType(),
            Builder.CreateShuffleVector(
                VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
            ConstantVector::get(Indices));
      }
      // Use the minimum alignment of the gathered loads.
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = propagateMetadata(NewLI, E->Scalars);

    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);

    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);

    Value *Ptr = SI->getPointerOperand();
    Instruction *ST;
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
        Ptr = SI->getPointerOperand();
      }
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          {VecValue, Ptr,
           ConstantInt::get(
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /*ArgNo=*/1,
          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
      ST = Inst;
    }

    Value *V = propagateMetadata(ST, E->Scalars);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0, TTI);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    SmallVector<Type *, 2> TysForDecl;
    // Add the return type if the intrinsic is overloaded on it.
    if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
      TysForDecl.push_back(VecTy);
    auto *CEI = cast<CallInst>(VL0);
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      // Some intrinsics have scalar arguments. Those arguments should not be
      // vectorized.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
        ScalarArg = CEI->getArgOperand(I);
        // If we decided to reduce the bit width of the abs intrinsic, its
        // second argument must be set to false (do not return poison if the
        // value is the signed minimum).
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        OpVecs.push_back(ScalarArg);
        if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
          TysForDecl.push_back(ScalarArg->getType());
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      ScalarArg = CEI->getArgOperand(I);
      if (cast<VectorType>(OpVec->getType())->getElementType() !=
              ScalarArg->getType()->getScalarType() &&
          It == MinBWs.end()) {
        auto *CastTy =
            getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      OpVecs.push_back(OpVec);
      if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
        TysForDecl.push_back(OpVec->getType());
    }

    Function *CF;
    if (!UseIntrinsic) {
      VFShape Shape = VFShape::get(CI->getFunctionType(),
                                   ElementCount::getFixed(
                                       VecTy->getNumElements()),
                                   /*HasGlobalPred=*/false);
      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
    } else {
      CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

    propagateIRFlags(V, E->Scalars, VL0);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    Value *V;
    if (SLPReVec && !E->isAltShuffle()) {
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
      if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
        assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
               "Not supported shufflevector usage.");
        SmallVector<int> NewMask(ThisMask.size());
        transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
          return SVSrc->getShuffleMask()[Mask];
        });
        V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
      } else {
        V = Builder.CreateShuffleVector(Src, ThisMask);
      }
      propagateIRFlags(V, E->Scalars, VL0);
      if (auto *I = dyn_cast<Instruction>(V))
        V = propagateMetadata(I, E->Scalars);
      V = FinalShuffle(V, E);
    } else {
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");

      Value *LHS = nullptr, *RHS = nullptr;
      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        RHS = vectorizeOperand(E, 1, PostponedPHIs);
      } else {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0, PostponedPHIs);
      }
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      if (LHS && RHS &&
          ((Instruction::isBinaryOp(E->getOpcode()) &&
            (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
           (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
        if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
          if (cast<VectorType>(LHS->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            CastTy = RHS->getType();
          else
            CastTy = LHS->getType();
        }
        if (LHS->getType() != CastTy)
          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
        if (RHS->getType() != CastTy)
          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
      }

      Value *V0, *V1;
      if (Instruction::isBinaryOp(E->getOpcode())) {
        V0 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
        V1 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
        auto *AltCI = cast<CmpInst>(E->getAltOp());
        CmpInst::Predicate AltPred = AltCI->getPredicate();
        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
      } else {
        if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              cast<VectorType>(LHS->getType())->getElementType());
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
            assert(LHS->getType() == VecTy &&
                   "Expected same type as operand.");
            if (auto *I = dyn_cast<Instruction>(LHS))
              LHS = propagateMetadata(I, E->Scalars);
            LHS = FinalShuffle(LHS, E);
            E->VectorizedValue = LHS;
            ++NumVectorInstructions;
            return LHS;
          }
        }
        V0 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
        V1 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
      }
      // Add V0 and V1 to later analysis to try to find and remove matching
      // instructions, if any.
      for (Value *V : {V0, V1}) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      // Create a shuffle that takes alternating operations from the vectors,
      // and gather up main and alt scalar ops to propagate IR flags to each
      // vector operation.
      ValueList OpScalars, AltScalars;
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [E, this](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask, &OpScalars, &AltScalars);

      propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
      propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
        // Drop nuw flags for commutative subtractions.
        if (auto *I = dyn_cast<Instruction>(Vec);
            I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            any_of(E->Scalars, [](Value *V) {
              if (isa<PoisonValue>(V))
                return false;
              auto *IV = cast<Instruction>(V);
              return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      };
      DropNuwFlag(V0, E->getOpcode());
      DropNuwFlag(V1, E->getAltOpcode());

      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
      }
      V = Builder.CreateShuffleVector(V0, V1, Mask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        V = propagateMetadata(I, E->Scalars);
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  default:
    llvm_unreachable("unknown inst");
  }
  return nullptr;
}
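// Entry point for code emission over the whole graph: schedule each block,
// vectorize gathered-load and PHI nodes first, then the postponed gather
// nodes (replacing their stub values), and finally rewrite external uses of
// the scalars with extracts from the vectorized values.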
Value *
BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
                       Instruction *ReductionRoot) {
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }

  // Clean the entry-to-last-instruction table; it can be affected by
  // scheduling and needs to be rebuilt.
  EntryToLastInstruction.clear();

  if (ReductionRoot)
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
  else
    Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());

  // Emit gathered loads first to emit better code for their users.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst &&
        (!TE->isGather() || !TE->UserTreeIndices.empty())) {
      assert((!TE->UserTreeIndices.empty() ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
    }
  }
  // Postpone emission of PHI operands to avoid cyclic-dependency issues.
  (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
    if (TE->State == TreeEntry::Vectorize &&
        TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
        TE->VectorizedValue)
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
  // Run through the list of postponed gathers and emit them, replacing the
  // temporary stub values with actual vector instructions.
  ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
  DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
      if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
              TE->UserTreeIndices.front().EdgeIdx)) &&
          VecTE->isSame(TE->Scalars))
        // Found a gather node which is absolutely the same as one of the
        // vectorized nodes. It may happen after reordering.
        continue;
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI =
        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator, since the stub's position may have become
    // invalid while its dependencies were being resolved.
    if (isa<PHINode>(UserI)) {
      Instruction *InsertPt = PrevVec->getParent()->getTerminator();
      for (User *U : PrevVec->users()) {
        if (U == UserI)
          continue;
        auto *UI = dyn_cast<Instruction>(U);
        if (!UI || isa<PHINode>(UI) ||
            UI->getParent() != InsertPt->getParent())
          continue;
        if (UI->comesBefore(InsertPt))
          InsertPt = UI;
      }
      Builder.SetInsertPoint(InsertPt);
    } else {
      Builder.SetInsertPoint(PrevVec);
    }
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
    Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        VecI && VecI->getParent() == Builder.GetInsertBlock() &&
        Builder.GetInsertPoint()->comesBefore(VecI))
      VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
                                 Builder.GetInsertPoint());
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        if (const TreeEntry *BaseTE = getTreeEntry(V)) {
          auto It = MinBWs.find(BaseTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            if (*IsSigned)
              break;
          }
          for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
            auto It = MinBWs.find(MNTE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          // Scan through the gather nodes.
          for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
            auto It = MinBWs.find(BVE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
            IsSigned =
                IsSigned.value_or(false) ||
                !isKnownNonNegative(EE->getVectorOperand(),
                                    SimplifyQuery(*DL));
            continue;
          }
          if (IsSigned.value_or(false))
            break;
        }
      }
      if (IsSigned.value_or(false)) {
        // Final attempt: check the user node.
        auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(IsSigned &&
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node, if it was already used for one of the
    // buildvector nodes.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    eraseInstruction(PrevVec);
  }
  // Extract all of the elements with external uses.
  SmallDenseSet<Value *, 4> UsedInserts;
  DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
  SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
  SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
  SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    llvm::User *User = ExternalUse.User;

    // Skip users that we already RAUWed. This happens when one instruction
    // has multiple uses of the same value.
    if (User && !is_contained(Scalar->users(), User))
      continue;
    TreeEntry *E = getTreeEntry(Scalar);
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    // Non-instruction pointers are not deleted, just skipped.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        auto *Inst = dyn_cast<Instruction>(Scalar);
        bool ReplaceInst =
            Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts, just move the only one in the
          // current block.
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
              if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
                CI->moveAfter(I);
            }
            Ex = PrevV;
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // "Reuse" the existing extract to improve final codegen.
          if (ReplaceInst) {
            // Leave the instruction as is, if it is cheaper to extract and
            // all operands are scalar.
            if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
              IgnoredExtracts.insert(EE);
              Ex = EE;
            } else {
              auto *CloneInst = Inst->clone();
              CloneInst->insertBefore(Inst->getIterator());
              if (Inst->hasName())
                CloneInst->takeName(Inst);
              Ex = CloneInst;
            }
          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
                     ES && isa<Instruction>(Vec)) {
            Value *V = ES->getVectorOperand();
            auto *IVec = cast<Instruction>(Vec);
            if (const TreeEntry *ETE = getTreeEntry(V))
              V = ETE->VectorizedValue;
            if (auto *IV = dyn_cast<Instruction>(V);
                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            else
              Ex = Builder.CreateExtractElement(Vec, Lane);
          } else if (auto *VecTy =
                         dyn_cast<FixedVectorType>(Scalar->getType())) {
            assert(SLPReVec && "FixedVectorType is not expected.");
            unsigned VecTyNumElements = VecTy->getNumElements();
            // When REVEC is enabled, we need to extract a vector.
            Ex = createExtractVector(Builder, Vec, VecTyNumElements,
                                     ExternalUse.Lane * VecTyNumElements);
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          // If necessary, sign-extend or zero-extend the extract to the
          // larger type.
          ExV = Ex;
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(
                Ex, Scalar->getType(),
                !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
          auto *I = dyn_cast<Instruction>(Ex);
          ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
                                            : &F->getEntryBlock(),
                                          std::make_pair(Ex, ExV));
        }
        if (auto *ExI = dyn_cast<Instruction>(Ex);
            ExI && !isa<PHINode>(ExI)) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        return ExV;
      }
      assert(isa<FixedVectorType>(Scalar->getType()) &&
             isa<InsertElementInst>(Scalar) &&
             "In-tree scalar of vector type is not insertelement?");
      auto *IE = cast<InsertElementInst>(Scalar);
      VectorToInsertElement.try_emplace(Vec, IE);
      return Vec;
    };
    // If User == nullptr, the Scalar remains as a scalar in vectorized
    // instructions or is used as an extra argument. Generate an
    // extractelement and update the record in ExternallyUsedValues.
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert((ExternallyUsedValues.count(Scalar) ||
              ExternalUsesAsOriginalScalar.contains(Scalar) ||
              any_of(Scalar->users(),
                     [&](llvm::User *U) {
                       if (ExternalUsesAsOriginalScalar.contains(U))
                         return true;
                       TreeEntry *UseEntry = getTreeEntry(U);
                       return UseEntry &&
                              (UseEntry->State == TreeEntry::Vectorize ||
                               UseEntry->State ==
                                   TreeEntry::StridedVectorize) &&
                              (E->State == TreeEntry::Vectorize ||
                               E->State == TreeEntry::StridedVectorize) &&
                              doesInTreeUserNeedToExtract(
                                  Scalar, getRootEntryInstruction(*UseEntry),
                                  TLI, TTI);
                     })) &&
             "Scalar with nullptr User must be registered in "
             "ExternallyUsedValues map or remain as scalar in vectorized "
             "instructions");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI)) {
          if (PHI->getParent()->isLandingPad())
            Builder.SetInsertPoint(
                PHI->getParent(),
                std::next(
                    PHI->getParent()->getLandingPadInst()->getIterator()));
          else
            Builder.SetInsertPoint(PHI->getParent(),
                                   PHI->getParent()->getFirstNonPHIIt());
        } else {
          Builder.SetInsertPoint(VecI->getParent(),
                                 std::next(VecI->getIterator()));
        }
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock(),
                               F->getEntryBlock().begin());
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // Required to update internally referenced instructions.
      if (Scalar != NewInst) {
        assert((!isa<ExtractElementInst>(Scalar) ||
                !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
               "Extractelements should not be replaced.");
        Scalar->replaceAllUsesWith(NewInst);
      }
      continue;
    }
    if (auto *VU = dyn_cast<InsertElementInst>(User);
        VU && VU->getOperand(1) == Scalar) {
      // Skip if the scalar is another vector op or Vec is not an instruction.
      if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
        if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
          if (!UsedInserts.insert(VU).second)
            continue;
          // Need to use the original vector, if the root is truncated.
          auto BWIt = MinBWs.find(E);
          if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
            auto *ScalarTy = FTy->getElementType();
            auto Key = std::make_pair(Vec, ScalarTy);
            auto VecIt = VectorCasts.find(Key);
            if (VecIt == VectorCasts.end()) {
              IRBuilderBase::InsertPointGuard Guard(Builder);
              if (auto *IVec = dyn_cast<PHINode>(Vec)) {
                if (IVec->getParent()->isLandingPad())
                  Builder.SetInsertPoint(IVec->getParent(),
                                         std::next(IVec->getParent()
                                                       ->getLandingPadInst()
                                                       ->getIterator()));
                else
                  Builder.SetInsertPoint(
                      IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
              } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
                Builder.SetInsertPoint(IVec->getNextNode());
              }
              Vec = Builder.CreateIntCast(
                  Vec,
                  getWidenedType(
                      ScalarTy,
                      cast<FixedVectorType>(Vec->getType())
                          ->getNumElements()),
                  BWIt->second.second);
              VectorCasts.try_emplace(Key, Vec);
            } else {
              Vec = VecIt->second;
            }
          }

          std::optional<unsigned> InsertIdx = getElementIndex(VU);
          if (InsertIdx) {
            auto *It = find_if(
                ShuffledInserts,
                [VU](const ShuffledInsertData<Value *> &Data) {
                  // Checks if 2 insertelements are from the same buildvector.
                  InsertElementInst *VecInsert = Data.InsertElements.front();
                  return areTwoInsertFromSameBuildVector(
                      VU, VecInsert,
                      [](InsertElementInst *II) { return II->getOperand(0); });
                });
            unsigned Idx = *InsertIdx;
            if (It == ShuffledInserts.end()) {
              (void)ShuffledInserts.emplace_back();
              It = std::next(ShuffledInserts.begin(),
                             ShuffledInserts.size() - 1);
            }
            SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
            if (Mask.empty())
              Mask.assign(FTy->getNumElements(), PoisonMaskElem);
            Mask[Idx] = ExternalUse.Lane;
            It->InsertElements.push_back(cast<InsertElementInst>(User));
            continue;
          }
        }
      }
    }

    // Generate extracts for out-of-tree users and find the insertion point
    // for the extractelement lane.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (PHINode *PH = dyn_cast<PHINode>(User)) {
        // If the user is a PHI node, insert into every predecessor that uses
        // the scalar.
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              Builder.SetInsertPoint(
                  PH->getIncomingBlock(I)->getTerminator());
            }
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        Builder.SetInsertPoint(cast<Instruction>(User));
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        User->replaceUsesOfWith(Scalar, NewInst);
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      User->replaceUsesOfWith(Scalar, NewInst);
    }

    LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
  }
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };

  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
    if (VF != VecVF) {
      if (any_of(Mask,
                 [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
        Vec = CreateShuffle(Vec, nullptr, Mask);
        return std::make_pair(Vec, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        Vec = CreateShuffle(Vec, nullptr, ResizeMask);
      }
    }

    return std::make_pair(Vec, false);
  };
  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // Find the first and the last instruction in the list of insertelements.
    sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
    InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
    InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
    Builder.SetInsertPoint(LastInsert);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    Value *NewInst = performExtractsShuffleAction<Value>(
        MutableArrayRef(Vector.data(), Vector.size()),
        FirstInsert->getOperand(0),
        [](Value *Vec) {
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
        },
        ResizeToVF,
        [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
                                      ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create a shuffle if the mask is a simple identity
            // non-resizing mask.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : FirstInsert->getOperand(0),
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild the last insertelement instructions.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    SmallVector<Instruction *> Inserts;
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      if (*It == II)
        ++It;
      else
        Inserts.push_back(cast<Instruction>(II));
      II = dyn_cast<InsertElementInst>(II->getOperand(0));
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
      NewInst = II;
    }
    LastInsert->replaceAllUsesWith(NewInst);
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            PoisonValue::get(IE->getOperand(0)->getType()));
      IE->replaceUsesOfWith(IE->getOperand(1),
                            PoisonValue::get(IE->getOperand(1)->getType()));
      eraseInstruction(IE);
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    if (Entry->isGather())
    assert(Entry->VectorizedValue && "Can't find vectorizable value");
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
      if (isa<PoisonValue>(Scalar))
      Type *Ty = Scalar->getType();
        for (User *U : Scalar->users()) {
          assert((getTreeEntry(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      auto *I = cast<Instruction>(Scalar);
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);
  if (UserIgnoreList) {
          const TreeEntry *IE = getTreeEntry(I);
          if (IE->Idx != 0 &&
              !(VectorizableTree.front()->isGather() &&
                !IE->UserTreeIndices.empty() &&
                (ValueToGatherNodes.lookup(I).contains(
                     VectorizableTree.front().get()) ||
                        [&](const EdgeInfo &EI) {
                          return EI.UserTE == VectorizableTree.front().get() &&
                                 EI.EdgeIdx == UINT_MAX;
              !(GatheredLoadsEntriesFirst.has_value() &&
                IE->Idx >= *GatheredLoadsEntriesFirst &&
                VectorizableTree.front()->isGather() &&
          bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                      (match(U.getUser(), m_LogicalAnd()) ||
                                       match(U.getUser(), m_LogicalOr())) &&
                                      U.getOperandNo() == 0;
          if (IsPoisoningLogicalOp) {
            LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          return UserIgnoreList->contains(U.getUser());
  removeInstructionsAndOperands(ArrayRef(RemovedInsts));
  InstrElementSize.clear();
  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
        ReductionRoot->getIterator());
        cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
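// optimizeGatherSequence CSEs the insertelement/extractelement/shufflevector
// sequences emitted for gathers: gathers whose operands are loop-invariant
// are hoisted into the loop preheader, and a dominance-ordered scan over
// CSEBlocks merges instructions that are identical, or shuffles that differ
// only in undef mask lanes (IsIdenticalOrLessDefined).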
                    << " gather sequences instructions.\n");
      Loop *L = LI->getLoopFor(I->getParent());
      BasicBlock *PreHeader = L->getLoopPreheader();
        auto *OpI = dyn_cast<Instruction>(V);
        return OpI && L->contains(OpI);
      CSEBlocks.insert(PreHeader);
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
    if (I1->getType() != I2->getType())
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
          NewMask[I] != SM1[I])
        NewMask[I] = SM1[I];
    return SM1.size() - LastUndefsCnt > 1 &&
               SM1.size() - LastUndefsCnt));
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
           "Worklist not sorted properly!");
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
      bool Replaced = false;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          V->replaceAllUsesWith(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
        Visited.push_back(&In);
  GatherShuffleExtractSeq.clear();
BoUpSLP::ScheduleData *
  ScheduleData *Bundle = nullptr;
  ScheduleData *PrevInBundle = nullptr;
  for (Value *V : VL) {
    ScheduleData *BundleMember = getScheduleData(V);
           "no ScheduleData for bundle member "
           "(maybe not in same basic block)");
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle) {
      PrevInBundle->NextInBundle = BundleMember;
      Bundle = BundleMember;
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  assert(Bundle && "Failed to find schedule bundle");
std::optional<BoUpSLP::ScheduleData *>
                                            const InstructionsState &S) {
  if (isa<PHINode>(S.getMainOp()) ||
  LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](
                                   bool ReSchedule, ScheduleData *Bundle) {
    if (ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
        if (ScheduleData *SD = getScheduleData(I))
          SD->clearDependencies();
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
      initialFillReadyList(ReadyInsts);
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleData *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
             "must be ready to schedule");
      schedule(Picked, ReadyInsts);
  for (Value *V : VL) {
    if (!extendSchedulingRegion(V, S)) {
      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
      return std::nullopt;
  bool ReSchedule = false;
  for (Value *V : VL) {
    ScheduleData *BundleMember = getScheduleData(V);
           "no ScheduleData for bundle member (maybe not in same basic block)");
    ReadyInsts.remove(BundleMember);
    if (!BundleMember->IsScheduled)
    LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
                      << " was already scheduled\n");
  auto *Bundle = buildBundle(VL);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle->isReady()) {
    cancelScheduling(VL, S.getMainOp());
    return std::nullopt;
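// cancelScheduling undoes buildBundle: every member becomes its own
// scheduling entity again (FirstInBundle points back to itself, NextInBundle
// and TE are cleared), and members with no remaining unscheduled in-bundle
// dependencies are returned to the ready list.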
  ScheduleData *Bundle = getScheduleData(OpValue);
  LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() &&
         "tried to unbundle something which is not a bundle");
  if (Bundle->isReady())
    ReadyInsts.remove(Bundle);
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->TE = nullptr;
    if (BundleMember->unscheduledDepsInBundle() == 0) {
      ReadyInsts.insert(BundleMember);
    BundleMember = Next;
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
  return &(ScheduleDataChunks.back()[ChunkPos++]);
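// extendSchedulingRegion grows the [ScheduleStart, ScheduleEnd) window of
// the basic block until it covers V, searching upwards and downwards from
// the current region while skipping assume-like intrinsics. The growth is
// capped by ScheduleRegionSizeLimit to bound compile time.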
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  assert(I && "bundle member must be an instruction");
         "phi nodes/insertelements/extractelements/extractvalues don't need to "
  if (getScheduleData(I))
  if (!ScheduleStart) {
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
      ++ScheduleStart->getIterator().getReverse();
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);
    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
        FirstLoadStoreInRegion = SD;
      CurrentLoadStore = SD;
    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
    LastLoadStoreInRegion = CurrentLoadStore;
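// calculateDependencies computes, for every bundle member, the dependencies
// that constrain scheduling inside the region: (1) def-use dependencies on
// in-region users of the member's result; (2) control dependencies, e.g.
// keeping allocas and memory operations from being reordered across
// stacksave/stackrestore when the region contains them; and (3) memory
// dependencies on later entries of the NextLoadStore chain that may alias,
// queried through SLP->isAliased with budgets on the number of alias checks
// and the distance searched.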
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
  assert(SD->isSchedulingEntity());
  while (!WorkList.empty()) {
    for (ScheduleData *BundleMember = SD; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      assert(isInSchedulingRegion(BundleMember));
      if (BundleMember->hasValidDependencies())
      BundleMember->Dependencies = 0;
      BundleMember->resetUnscheduledDeps();
      for (User *U : BundleMember->Inst->users()) {
        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->ControlDependencies.push_back(BundleMember);
        BundleMember->Dependencies++;
        ScheduleData *DestBundle = DepDest->FirstInBundle;
        if (!DestBundle->IsScheduled)
          BundleMember->incrementUnscheduledDeps(1);
        if (!DestBundle->hasValidDependencies())
        for (Instruction *I = BundleMember->Inst->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          MakeControlDependent(I);
      if (RegionHasStackSave) {
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
            if (!isa<AllocaInst>(I))
            MakeControlDependent(I);
        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
            MakeControlDependent(I);
      ScheduleData *DepDest = BundleMember->NextLoadStore;
             "NextLoadStore list for non memory effecting bundle?");
      bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
      unsigned NumAliased = 0;
      unsigned DistToSrc = 1;
      for (; DepDest; DepDest = DepDest->NextLoadStore) {
        assert(isInSchedulingRegion(DepDest));
            ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
              SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
          DepDest->MemoryDependencies.push_back(BundleMember);
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = DepDest->FirstInBundle;
          if (!DestBundle->IsScheduled) {
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies()) {
  if (InsertInReadyList && SD->isReady()) {
    ReadyInsts.insert(SD);
void BoUpSLP::BlockScheduling::resetSchedule() {
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = getScheduleData(I)) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->IsScheduled = false;
      SD->resetUnscheduledDeps();
  ReadyInsts.clear();
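// scheduleBlock runs list scheduling over the region: bundles are assigned
// priorities in instruction order, dependencies are calculated for every
// scheduling entity, and ready bundles are repeatedly taken from a
// priority-ordered set, their instructions moved up before the last
// scheduled instruction, until the ready set is empty. The debug-only loop
// at the end verifies that all schedulable entities were actually scheduled.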
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
  BS->resetSchedule();
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] TreeEntry *SDTE = getTreeEntry(SD->Inst);
             SD->isPartOfBundle() ==
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;
      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, /*InsertInReadyList=*/false, this);
  BS->initialFillReadyList(ReadyInsts);
  Instruction *LastScheduledInst = BS->ScheduleEnd;
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      LastScheduledInst = PickedInst;
    BS->schedule(Picked, ReadyInsts);
#ifdef EXPENSIVE_CHECKS
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  BS->ScheduleStart = nullptr;
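// getVectorElementSize estimates the element width in bits that V will
// occupy in a vector: stores answer with the stored type, otherwise a small
// def-use walk within the enclosing block looks for loads, extracts, or
// non-bool roots that pin the width, falling back to the value's own type
// size. Results are memoized in InstrElementSize.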
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
  if (auto *IEI = dyn_cast<InsertElementInst>(V))
  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
  if (auto *I = dyn_cast<Instruction>(V)) {
    Value *FirstNonBool = nullptr;
    while (!Worklist.empty()) {
      auto *Ty = I->getType();
      if (isa<VectorType>(Ty))
      if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
        Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
    Width = DL->getTypeSizeInBits(V->getType());
  InstrElementSize[I] = Width;
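// collectValuesToDemote checks recursively whether a tree entry can be
// computed in a narrower integer type. Per scalar it combines sign-bit
// information (ComputeNumSignBits) with known leading zeros from the
// known-bits mask, and per-opcode checkers prove that truncation preserves
// the low bits: e.g. shifts require a provably in-range shift amount,
// lshr/udiv/urem require the shifted-out or masked-off bits to be known
// zero, and min/max intrinsics require enough sign bits on both operands.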
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  if (all_of(E.Scalars, IsaPred<Constant>))
  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  if (NodesToKeepBWs.contains(E.Idx))
  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(V))
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      unsigned BitWidth1 = OrigBitWidth - NumSignBits;
      if (auto *I = dyn_cast<Instruction>(V)) {
        unsigned BitWidth2 =
            std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
        while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        BitWidth1 = std::min(BitWidth1, BitWidth2);
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    if (Res && E.isGather()) {
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        UniqueBases.insert(EE->getVectorOperand());
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
  if (E.isGather() || !Visited.insert(&E).second ||
        return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !getTreeEntry(U);
    return FinalAnalysis();
    return !all_of(V->users(), [=](User *U) {
             return getTreeEntry(U) ||
                    (E.Idx == 0 && UserIgnoreList &&
                     UserIgnoreList->contains(U)) ||
                    (!isa<CmpInst>(U) && U->getType()->isSized() &&
                     !U->getType()->isScalableTy() &&
                     DL->getTypeSizeInBits(U->getType()) <= BitWidth);
           }) && !IsPotentiallyTruncated(V, BitWidth);
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
        if (!FinalAnalysis())
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
  auto AttemptCheckBitwidth =
    NeedToExit = false;
    unsigned BestFailBitwidth = 0;
      if (Checker(BitWidth, OrigBitWidth))
      if (BestFailBitwidth == 0 && FinalAnalysis())
      if (BestFailBitwidth == 0) {
  auto TryProcessInstruction =
        (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
        if (E.UserTreeIndices.size() > 1 &&
            !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
        bool NeedToExit = false;
        if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
        if (!ProcessOperands(Operands, NeedToExit))
    return IsProfitableToDemote;
  switch (E.getOpcode()) {
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
      if (isa<PoisonValue>(V))
      auto *I = cast<Instruction>(V);
      KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
      return AmtKnownBits.getMaxValue().ult(BitWidth);
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  case Instruction::LShr: {
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        if (isa<PoisonValue>(V))
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
  case Instruction::AShr: {
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        if (isa<PoisonValue>(V))
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
  case Instruction::UDiv:
  case Instruction::URem: {
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
              std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
    return TryProcessInstruction(BitWidth, Ops);
  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
      CallChecker = AbsChecker;
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned VF = E.Scalars.size();
      if (Cost < BestCost) {
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
  return FinalAnalysis();
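// computeMinimumValueSizes drives the bitwidth minimization over the whole
// tree. As a rough example: if every scalar of a node is known to fit in 8
// bits, the node and its feeding subtree can be demoted from i32 to i8, with
// a single extension re-materialized at the root; this shrinks register
// pressure and can enable a larger VF. For reductions, ReductionBitWidth
// tracks the narrowest width at which the final reduction stays sound.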
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
  if (VectorizableTree[NodeIdx]->isGather() ||
      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
      (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                return EI.UserTE->Idx > NodeIdx;
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    IsProfitableToDemoteRoot = true;
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
          return V->hasOneUse() || isa<Constant>(V) ||
                   const TreeEntry *TE = getTreeEntry(U);
                   const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
                   if (TE == UserTE || !TE)
                   if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                       !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                            SelectInst>(UserTE->getMainOp()))
                   unsigned UserTESz = DL->getTypeSizeInBits(
                       UserTE->Scalars.front()->getType());
                   auto It = MinBWs.find(TE);
                   if (It != MinBWs.end() && It->second.first > UserTESz)
                   return DL->getTypeSizeInBits(U->getType()) > UserTESz;
      const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
      return MaxBitWidth;
    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
        [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
    unsigned MaxBitWidth = 1u;
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      if (!IsKnownPositive)
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    if (MaxBitWidth < 8 && MaxBitWidth > 1)
    if (NumParts > 1 &&
    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;
    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
    MaxBitWidth = bit_ceil(MaxBitWidth);
    return MaxBitWidth;
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    if (all_of(*UserIgnoreList,
                 return isa<PoisonValue>(V) ||
                        cast<Instruction>(V)->getOpcode() == Instruction::Add;
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
      ReductionBitWidth = 1;
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        unsigned BitWidth2 = BitWidth1;
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;
      ReductionBitWidth = bit_ceil(ReductionBitWidth);
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    IsTruncRoot = true;
  bool IsSignedCmp = false;
  while (NodeIdx < VectorizableTree.size()) {
    unsigned Limit = 2;
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    for (unsigned Idx : RootDemotes) {
          DL->getTypeSizeInBits(V->getType()->getScalarType());
      if (OrigBitWidth > MaxBitWidth) {
    RootDemotes.clear();
    IsProfitableToDemoteRoot = true;
    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
      unsigned NewIdx = 0;
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                   EI.UserTE->getOpcode() == Instruction::Trunc &&
                   !EI.UserTE->isAltShuffle();
          NodeIdx < VectorizableTree.size() &&
              VectorizableTree[NodeIdx]->UserTreeIndices,
                return (EI.UserTE->hasState() &&
                        EI.UserTE->getOpcode() == Instruction::ICmp) &&
                         auto *IC = dyn_cast<ICmpInst>(V);
                             !isKnownNonNegative(IC->getOperand(0),
                                                 SimplifyQuery(*DL)) ||
                             !isKnownNonNegative(IC->getOperand(1),
                                                 SimplifyQuery(*DL)));
    if (MaxBitWidth == 0 ||
        cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
      if (UserIgnoreList)
  for (unsigned Idx : ToDemote) {
    TreeEntry *TE = VectorizableTree[Idx].get();
    bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
      return !isKnownNonNegative(R, SimplifyQuery(*DL));
  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  DL = &F.getDataLayout();
  bool Changed = false;
      dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
  for (auto *BB : post_order(&F.getEntryBlock())) {
    R.clearReductionData();
    collectSeedInstructions(BB);
    if (!Stores.empty()) {
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    Changed |= vectorizeChainsInBlock(BB, R);
    if (!GEPs.empty()) {
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
  R.optimizeGatherSequence();
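// vectorizeStoreChain tries one chain of consecutive stores at a fixed VF:
// it builds the SLP tree for the chain, runs the usual reorder/transform/
// external-use/minimum-bitwidth pipeline, and emits vector code only when
// the cost model reports a profitable (or user-forced) cost. The canonical
// graph size is reported back so the caller can steer its VF search.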
                                            unsigned Idx, unsigned MinVF,
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();
          *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
      VF < 2 || VF < MinVF) {
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    bool IsAllowedSize =
    if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
                    return !isa<ExtractElementInst>(V) &&
                           (V->getNumUses() > Chain.size() ||
                            any_of(V->users(), [&](User *U) {
                              return !Stores.contains(U);
        (ValOps.size() > Chain.size() / 2 && !S)) {
      Size = (!IsAllowedSize && S) ? 1 : 2;
  if (R.isLoadCombineCandidate(Chain))
  R.buildTree(Chain);
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.transformNodes();
  R.buildExternalUses();
  R.computeMinimumValueSizes();
  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    using namespace ore;
                                        cast<StoreInst>(Chain[0]))
              << "Stores SLP vectorized with cost " << NV("Cost", Cost)
              << " and with tree size "
              << NV("TreeSize", R.getTreeSize()));
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned P = First ? Val.first : Val.second;
        return V + (P - Mean) * (P - Mean);
  return Dev * 81 / (Mean * Mean) == 0;
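// vectorizeStores buckets the seed stores by value type and pointer
// distance, then iterates over candidate VFs: for each VF it slides a window
// across every not-yet-vectorized range, attempts vectorizeStoreChain on
// each slice, and uses the returned tree sizes to decide whether another
// round with a doubled VF (bounded by MaxAttempts and StoresLimit) may still
// pay off.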
bool SLPVectorizerPass::vectorizeStores(
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
  bool Changed = false;
  struct StoreDistCompare {
    bool operator()(const std::pair<unsigned, int> &Op1,
                    const std::pair<unsigned, int> &Op2) const {
      return Op1.second < Op2.second;
  using StoreIndexToDistSet =
      std::set<std::pair<unsigned, int>, StoreDistCompare>;
  auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
      if (Operands.empty() || Data.second - PrevDist == 1) {
        PrevDist = Data.second;
        if (Idx != Set.size() - 1)
        Operands.push_back(Stores[DataVar.first]);
        PrevDist = DataVar.second;
            .insert({Operands.front(),
                     cast<StoreInst>(Operands.front())->getValueOperand(),
                     cast<StoreInst>(Operands.back())->getValueOperand(),
      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      unsigned MinVF = std::max<unsigned>(
          R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << "MinVF (" << MinVF << ")\n");
      unsigned NonPowerOf2VF = 0;
        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
      unsigned MaxRegVF = MaxVF;
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << "MinVF (" << MinVF << ")\n");
        unsigned Size = MinVF;
          VF = Size > MaxVF ? NonPowerOf2VF : Size;
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
        P.first = P.second = 1;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned Size : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned StartIdx = std::distance(
              RangeSizes.begin(),
              find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
                                            std::placeholders::_1)));
          while (StartIdx < End) {
                std::distance(RangeSizes.begin(),
                              find_if(RangeSizes.drop_front(StartIdx),
                                      std::bind(IsVectorized, Size >= MaxRegVF,
                                                std::placeholders::_1)));
            unsigned Sz = EndIdx >= End ? End : EndIdx;
            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
                                  Size >= MaxRegVF)) {
                         return cast<StoreInst>(V)
                                    ->getValueOperand()
                                cast<StoreInst>(Slice.front())
                                    ->getValueOperand()
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
                  Cnt += NonSchedSizeMax;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
                    .first->getSecond()
                AnyProfitableGraph = RepeatChanged = Changed = true;
                         [](std::pair<unsigned, unsigned> &P) {
                           P.first = P.second = 0;
                if (Cnt < StartIdx + MinVF) {
                  for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                  StartIdx = Cnt + Size;
                if (Cnt > Sz - Size - MinVF) {
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
              if (Size > 2 && Res &&
                          std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
                                    std::placeholders::_1))) {
              if (Size > MaxRegVF && TreeSize > 1 &&
                          std::bind(FirstSizeSame, TreeSize,
                                    std::placeholders::_1))) {
                while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
                       [&](std::pair<unsigned, unsigned> &P) {
                         if (Size >= MaxRegVF)
                           P.second = std::max(P.second, TreeSize);
                           P.first = std::max(P.first, TreeSize);
              AnyProfitableGraph = true;
            if (StartIdx >= End)
            if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
              AnyProfitableGraph = true;
            StartIdx = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(Sz),
                        std::bind(IsNotVectorized, Size >= MaxRegVF,
                                  std::placeholders::_1)));
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            static_cast<unsigned>(
                RangeSizes.begin(),
                find_if(RangeSizes, std::bind(IsNotVectorized, true,
                                              std::placeholders::_1))) +
        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
        CandidateVFs.clear();
          CandidateVFs.push_back(Limit);
        if (VF > MaxTotalNum || VF >= StoresLimit)
        for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
          P.first = std::max(P.second, P.first);
        CandidateVFs.push_back(VF);
  for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
        Stores[Set.first]->getValueOperand()->getType(),
        Stores[Set.first]->getPointerOperand(),
        SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
    auto It = Set.second.find(std::make_pair(Idx, *Diff));
    if (It == Set.second.end()) {
      Set.second.emplace(Idx, *Diff);
    TryToVectorize(Set.second);
    unsigned ItIdx = It->first;
    int ItDist = It->second;
    StoreIndexToDistSet PrevSet;
    copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
            [&](const std::pair<unsigned, int> &Pair) {
              return Pair.first > ItIdx;
    Set.second.clear();
    Set.second.emplace(Idx, 0);
    unsigned StartIdx = ItIdx + 1;
    for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
      if (VectorizedStores.contains(Stores[Pair.first]))
      unsigned BI = Pair.first - StartIdx;
      UsedStores.set(BI);
      Dists[BI] = Pair.second - ItDist;
    for (unsigned I = StartIdx; I < Idx; ++I) {
      unsigned BI = I - StartIdx;
      if (UsedStores.test(BI))
        Set.second.emplace(I, Dists[BI]);
    auto &Res = SortedStores.emplace_back();
    Res.second.emplace(Idx, 0);
  Type *PrevValTy = nullptr;
    if (R.isDeleted(SI))
      PrevValTy = SI->getValueOperand()->getType();
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (auto &Set : SortedStores)
        TryToVectorize(Set.second);
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    FillStoresSet(I, SI);
  for (auto &Set : SortedStores)
    TryToVectorize(Set.second);
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
      if (isa<Constant>(Idx))
      if (GEP->getType()->isVectorTy())
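// collectSeedInstructions gathers per-block seeds: simple stores, grouped
// by underlying object for vectorizeStoreChains, and single-index scalar
// GEPs whose index is neither constant nor vector-typed, used by
// vectorizeGEPIndices.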
  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");
  for (Value *V : VL) {
    Type *Ty = V->getType();
      R.getORE()->emit([&]() {
        std::string TypeStr;
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
    R.getORE()->emit([&]() {
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
  bool Changed = false;
  bool CandidateFound = false;
  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);
      if (MaxVFOnly && ActualVF < MaxVF)
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
      for (Value *V : VL.drop_front(I)) {
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          if (Idx == ActualVF)
      if (Idx != ActualVF)
      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
      if (R.isTreeTinyAndNotFullyVectorizable())
      R.reorderTopToBottom();
      R.reorderBottomToTop(
          !isa<InsertElementInst>(Ops.front()) &&
          !R.doesRootHaveInTreeUses());
      R.transformNodes();
      R.buildExternalUses();
      R.computeMinimumValueSizes();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);
                        << " for VF=" << ActualVF << "\n");
                                        cast<Instruction>(Ops[0]))
              << "SLP vectorized with cost " << ore::NV("Cost", Cost)
              << " and with tree size "
              << ore::NV("TreeSize", R.getTreeSize()));
  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
  } else if (!Changed) {
    R.getORE()->emit([&]() {
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
      R);
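// The HorizontalReduction machinery below matches flat reduction trees such
// as ((a + b) + (c + d)) and rewrites them as a vector reduction. For
// instance, a chain of scalar adds over four values:
//
//   %r0 = add i32 %a, %b
//   %r1 = add i32 %r0, %c
//   %r2 = add i32 %r1, %d
//
// can become a <4 x i32> vector followed by llvm.vector.reduce.add, provided
// the target's cost model agrees. Min/max reductions are matched either as
// intrinsics or as cmp+select pairs, which is why several helpers below
// special-case the "CmpSelMinMax" shape.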
  ReductionOpsListType ReductionOps;
  bool IsSupportedHorRdxIdentityOp = false;
    return isa<SelectInst>(I) &&
    if (Kind == RecurKind::None)
    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
        return I->getFastMathFlags().noNaNs();
    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
    return I->isAssociative();
      return I->getOperand(2);
    return I->getOperand(Index);
    case RecurKind::Or: {
    case RecurKind::And: {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum: {
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
    auto *I = dyn_cast<Instruction>(V);
      return RecurKind::None;
      return RecurKind::Add;
      return RecurKind::Mul;
      return RecurKind::And;
      return RecurKind::Or;
      return RecurKind::Xor;
      return RecurKind::FAdd;
      return RecurKind::FMul;
      return RecurKind::FMax;
      return RecurKind::FMin;
      return RecurKind::FMaximum;
      return RecurKind::FMinimum;
      return RecurKind::SMax;
      return RecurKind::SMin;
      return RecurKind::UMax;
      return RecurKind::UMin;
    if (auto *Select = dyn_cast<SelectInst>(I)) {
        if (!isa<ExtractElementInst>(RHS) ||
          return RecurKind::None;
        if (!isa<ExtractElementInst>(LHS) ||
          return RecurKind::None;
        if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
          return RecurKind::None;
        return RecurKind::None;
        return RecurKind::None;
        return RecurKind::SMax;
        return RecurKind::SMin;
        return RecurKind::UMax;
        return RecurKind::UMin;
    return RecurKind::None;
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
    return isCmpSelMinMax(I) ? 3 : 2;
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    return I->getParent() == BB;
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    return I->hasOneUse();
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
      ReductionOps.assign(1, ReductionOpsType());
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
      ReductionOps[0].emplace_back(I);
    int Sz = Data.size();
    auto *I = dyn_cast<Instruction>(Data.front());
    return Sz > 1 || isConstant(Data.front()) ||
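// matchAssociativeReduction walks the use-def tree rooted at the reduction
// instruction, collecting further reduction ops of the same kind and the
// leaf "reduced values". Leaves are bucketed by structural key (loads
// additionally by pointer-distance subkeys) so that similar values land in
// the same candidate list, and lists are sorted by size to maximize the
// chance of forming full vectors.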
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
    ReductionRoot = Root;
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
        1, std::make_pair(Root, 0));
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
          PossibleReducedVals.push_back(EdgeVal);
        ReductionOps.push_back(EdgeInst);
        PossibleReducedVals;
    initReductionOps(Root);
    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      if (!LoadKeyUsed.insert(Key).second) {
        auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
        if (LIt != LoadsMap.end()) {
          for (LoadInst *RLI : LIt->second) {
          for (LoadInst *RLI : LIt->second) {
          if (LIt->second.size() > 2) {
                hash_value(LIt->second.back()->getPointerOperand());
          .first->second.push_back(LI);
    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      for (Value *V : PossibleRedVals) {
        ++PossibleReducedVals[Key][Idx]
              .insert(std::make_pair(V, 0))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
        auto RedValsVect = It->second.takeVector();
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
          (!isGoodForReduction(Data) &&
           (!isa<LoadInst>(Data.front()) ||
            !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
                cast<LoadInst>(Data.front())->getPointerOperand()) !=
                cast<LoadInst>(ReducedVals[NewIdx].front())
        NewIdx = ReducedVals.size();
      ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
      return P1.size() > P2.size();
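// tryToReduce emits the vectorized reduction: repeated scalars are counted
// in SameValuesCounter and, for kinds where it is sound (add/fadd/xor), are
// folded by scaling instead of widening the vector; ReduxWidth starts from
// the register-size-derived maximum and shrinks whenever a window of
// candidates fails scheduling or the cost model; and for boolean logical
// and/or reductions the running value is frozen before being combined, so
// poison is not propagated through the reassociated operations.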
19844 constexpr unsigned RegMaxNumber = 4;
19845 constexpr unsigned RedValsMaxNumber = 128;
19849 if (
unsigned NumReducedVals = std::accumulate(
19850 ReducedVals.
begin(), ReducedVals.
end(), 0,
19852 if (!isGoodForReduction(Vals))
19854 return Num + Vals.size();
19856 NumReducedVals < ReductionLimit &&
19860 for (ReductionOpsType &RdxOps : ReductionOps)
19861 for (
Value *RdxOp : RdxOps)
19862 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19873 ReducedVals.
front().size());
19877 auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
19878 assert(isa<SelectInst>(RdxRootInst) &&
19879 "Expected min/max reduction to have select root instruction");
19880 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19881 assert(isa<Instruction>(ScalarCond) &&
19882 "Expected min/max reduction to have compare condition");
19883 return cast<Instruction>(ScalarCond);
19886 bool AnyBoolLogicOp =
any_of(ReductionOps.back(), [](
Value *V) {
19887 return isBoolLogicOp(cast<Instruction>(V));
19890 auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
19891 if (VectorizedTree) {
19894 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19895 if (AnyBoolLogicOp) {
19896 auto It = ReducedValsToOps.
find(VectorizedTree);
19897 auto It1 = ReducedValsToOps.
find(Res);
19898 if ((It == ReducedValsToOps.
end() && It1 == ReducedValsToOps.
end()) ||
19900 (It != ReducedValsToOps.
end() &&
19902 return isBoolLogicOp(I) &&
19903 getRdxOperand(I, 0) == VectorizedTree;
19907 (It1 != ReducedValsToOps.
end() &&
19909 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19913 VectorizedTree = Builder.
CreateFreeze(VectorizedTree);
19917 return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
19924 ReductionOps.front().size());
19925 for (ReductionOpsType &RdxOps : ReductionOps)
19926 for (
Value *RdxOp : RdxOps) {
19929 IgnoreList.insert(RdxOp);
19934 for (
Value *U : IgnoreList)
19935 if (
auto *FPMO = dyn_cast<FPMathOperator>(U))
19936 RdxFMF &= FPMO->getFastMathFlags();
19937 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19942 for (
Value *V : Candidates)
19943 TrackedVals.try_emplace(V, V);
19946 Value *
V) ->
unsigned & {
19947 auto *It = MV.
find(V);
19948 assert(It != MV.
end() &&
"Unable to find given key.");
19957 bool CheckForReusedReductionOps =
false;
  // Try to vectorize elements based on their type.
  for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
    ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
    InstructionsState S = States[I];
    SmallVector<Value *> Candidates;
    DenseMap<Value *, Value *> TrackedToOrig;
    for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
      Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
      // Check if the reduction value was not overwritten by the
      // extractelement instruction because of the vectorization and exclude
      // it, if it is not compatible with other values.
      auto *Inst = dyn_cast<Instruction>(RdxVal);
      if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
           (!S || !S.isOpcodeOrAlt(Inst))) ||
          (S && !Inst))
        continue;
      Candidates.push_back(RdxVal);
      TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
    }
    bool ShuffledExtracts = false;
    // Try to handle shuffled extractelements.
    if (S && S.getOpcode() == Instruction::ExtractElement &&
        !S.isAltShuffle() && I + 1 < E) {
      SmallVector<Value *> CommonCandidates(Candidates);
      for (Value *RV : ReducedVals[I + 1]) {
        Value *RdxVal = TrackedVals.at(RV);
        auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
        if (!Inst)
          continue;
        CommonCandidates.push_back(RdxVal);
        TrackedToOrig.try_emplace(RdxVal, RV);
      }
      SmallVector<int> Mask;
      if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
        ++I;
        Candidates.swap(CommonCandidates);
        ShuffledExtracts = true;
      }
    }

    // Emit code for constant values.
    if (Candidates.size() > 1 && allConstant(Candidates)) {
      Value *Res = Candidates.front();
      Value *OrigV = TrackedToOrig.at(Candidates.front());
      ++VectorizedVals.try_emplace(OrigV).first->getSecond();
      for (Value *VC : ArrayRef(Candidates).drop_front()) {
        Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
        Value *OrigV = TrackedToOrig.at(VC);
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
        if (auto *ResI = dyn_cast<Instruction>(Res))
          V.analyzedReductionRoot(ResI);
      }
      VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
      continue;
    }

    unsigned NumReducedVals = Candidates.size();
    if (NumReducedVals < ReductionLimit &&
        (NumReducedVals < 2 || !isSplat(Candidates)))
      continue;
    // Check if we support repeated scalar values processing (optimization of
    // original scalar identity operations on matched horizontal reductions).
    IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
                                  RdxKind != RecurKind::FMul &&
                                  RdxKind != RecurKind::FMulAdd;
    // Gather same values.
    SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
    if (IsSupportedHorRdxIdentityOp)
      for (Value *V : Candidates) {
        Value *OrigV = TrackedToOrig.at(V);
        ++SameValuesCounter.try_emplace(OrigV).first->second;
      }
    // Check if the reduced values used the same number of times. In this case
    // the compiler may produce better code.
    bool SameScaleFactor = false;
    bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                            SameValuesCounter.size() != Candidates.size();
    if (OptReusedScalars) {
      SameScaleFactor =
          (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
           RdxKind == RecurKind::Xor) &&
          all_of(drop_begin(SameValuesCounter),
                 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
                   return P.second == SameValuesCounter.front().second;
                 });
      Candidates.resize(SameValuesCounter.size());
      transform(SameValuesCounter, Candidates.begin(),
                [&](const auto &P) { return TrackedVals.at(P.first); });
      NumReducedVals = Candidates.size();
      // Have a reduction of the same element.
      if (NumReducedVals == 1) {
        Value *OrigV = TrackedToOrig.at(Candidates.front());
        unsigned Cnt = At(SameValuesCounter, OrigV);
        Value *RedVal =
            emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
        VectorizedVals.try_emplace(OrigV, Cnt);
        ExternallyUsedValues.insert(OrigV);
        continue;
      }
    }
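    // Sketch (plain C++, assumed types) of the reuse counting above: record
    // how often each original scalar occurs so that, e.g., x+x+x+x can later
    // be emitted as 4*x instead of a wider vector reduction.
    //
    //   #include <map>
    //   #include <vector>
    //   static std::map<const void *, unsigned>
    //   countReusesSketch(const std::vector<const void *> &Vals) {
    //     std::map<const void *, unsigned> Counter;
    //     for (const void *V : Vals)
    //       ++Counter[V]; // mirrors ++SameValuesCounter.try_emplace(OrigV)...
    //     return Counter;
    //   }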
    unsigned MaxVecRegSize = V.getMaxVecRegSize();
    unsigned EltSize = V.getVectorElementSize(Candidates[0]);
    const unsigned MaxElts = std::clamp<unsigned>(
        /* ... */, RedValsMaxNumber, RegMaxNumber * RedValsMaxNumber);

    unsigned ReduxWidth = NumReducedVals;
    auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
      unsigned NumParts, NumRegs;
      Type *ScalarTy = Candidates.front()->getType();
      // ... (query TTI for the parts/registers of the widened type)
      while (NumParts > NumRegs) {
        assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
        ReduxWidth = bit_floor(ReduxWidth - 1);
        // ... (re-query NumParts/NumRegs for the narrower type)
      }
      if (NumParts > NumRegs / 2)
        ReduxWidth = bit_floor(ReduxWidth);
      // ...
      return ReduxWidth;
    };
    // ...
    ReduxWidth = GetVectorFactor(ReduxWidth);
    ReduxWidth = std::min(ReduxWidth, MaxElts);
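    // Sketch of the width-shrinking loop above, assuming C++20 <bit>; the
    // NumParts/NumRegs inputs stand in for the TTI register queries, and the
    // halving of NumParts is only an illustrative stand-in for re-querying.
    //
    //   #include <bit>
    //   static unsigned shrinkWidthSketch(unsigned ReduxWidth,
    //                                     unsigned NumParts,
    //                                     unsigned NumRegs) {
    //     while (NumParts > NumRegs && ReduxWidth > 1) {
    //       ReduxWidth = std::bit_floor(ReduxWidth - 1); // prev power of two
    //       NumParts = (NumParts + 1) / 2; // stand-in for the re-query
    //     }
    //     return ReduxWidth;
    //   }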
    unsigned Start = 0;
    unsigned Pos = Start;
    // Restarts vectorization attempt with lower vector factor.
    unsigned PrevReduxWidth = ReduxWidth;
    bool CheckForReusedReductionOpsLocal = false;
    auto AdjustReducedVals = [&](bool IgnoreVL = false) {
      bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
      if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
        // Check if any of the reduction ops are gathered. If so, worth
        // trying again with less number of reduction ops.
        CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
      }
      ++Pos;
      if (Pos < NumReducedVals - ReduxWidth + 1)
        return IsAnyRedOpGathered;
      Pos = Start;
      --ReduxWidth;
      if (ReduxWidth > 1)
        ReduxWidth = GetVectorFactor(ReduxWidth);
      return IsAnyRedOpGathered;
    };
    bool AnyVectorized = false;
    SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
    while (Pos < NumReducedVals - ReduxWidth + 1 &&
           ReduxWidth >= ReductionLimit) {
      // Dependency in tree of the reduction ops - drop this attempt, try
      // later.
      if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
          Start == 0) {
        CheckForReusedReductionOps = true;
        break;
      }
      PrevReduxWidth = ReduxWidth;
      ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
      // Been analyzed already - skip.
      if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
          (/* ... */
           IgnoredCandidates.contains(
               std::make_pair(Pos, bit_floor(ReduxWidth))) ||
           IgnoredCandidates.contains(
               std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
                              bit_floor(ReduxWidth)))) ||
          V.areAnalyzedReductionVals(VL)) {
        (void)AdjustReducedVals(/*IgnoreVL=*/true);
        continue;
      }
      // Early exit if any of the reduction values were deleted during
      // previous vectorization attempts.
      if (any_of(VL, [&V](Value *RedVal) {
            auto *RedValI = dyn_cast<Instruction>(RedVal);
            if (!RedValI)
              return false;
            return V.isDeleted(RedValI);
          }))
        break;
      V.buildTree(VL, IgnoreList);
      if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
        if (!AdjustReducedVals())
          V.analyzedReductionVals(VL);
        continue;
      }
      if (V.isLoadCombineReductionCandidate(RdxKind)) {
        if (!AdjustReducedVals())
          V.analyzedReductionVals(VL);
        continue;
      }
      V.reorderTopToBottom();
      // No need to reorder the root node at all.
      V.reorderBottomToTop(/*IgnoreReorder=*/true);
      // Keep extracted other reduction values, if they are used in the
      // vectorization trees.
      auto LocalExternallyUsedValues = ExternallyUsedValues;
      // The reduction root is used as the insertion point for new
      // instructions, so set it as externally used to prevent it from being
      // deleted.
      LocalExternallyUsedValues.insert(ReductionRoot);
      for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
        if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
          continue;
        for (Value *V : ReducedVals[Cnt])
          if (isa<Instruction>(V))
            LocalExternallyUsedValues.insert(TrackedVals[V]);
      }
      if (!IsSupportedHorRdxIdentityOp) {
        // Number of uses of the candidates in the vector of values.
        assert(SameValuesCounter.empty() &&
               "Reused values counter map is not empty");
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *V = Candidates[Cnt];
          Value *OrigV = TrackedToOrig.at(V);
          ++SameValuesCounter.try_emplace(OrigV).first->second;
        }
      }
      V.transformNodes();
      SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
      // Gather externally used values.
      SmallPtrSet<Value *, 4> Visited;
      for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
        if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
          continue;
        Value *RdxVal = Candidates[Cnt];
        if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
          RdxVal = It->second;
        if (!Visited.insert(RdxVal).second)
          continue;
        // Check if the scalar was vectorized as part of the vectorization
        // tree but not the top node.
        if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
          LocalExternallyUsedValues.insert(RdxVal);
          continue;
        }
        Value *OrigV = TrackedToOrig.at(RdxVal);
        unsigned NumOps =
            VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
        if (NumOps != ReducedValsToOps.at(OrigV).size())
          LocalExternallyUsedValues.insert(RdxVal);
      }
      // Do not need the list of reused scalars in regular mode anymore.
      if (!IsSupportedHorRdxIdentityOp)
        SameValuesCounter.clear();
      for (Value *RdxVal : VL)
        if (RequiredExtract.contains(RdxVal))
          LocalExternallyUsedValues.insert(RdxVal);
      V.buildExternalUses(LocalExternallyUsedValues);

      V.computeMinimumValueSizes();
      // Estimate cost.
      InstructionCost TreeCost = V.getTreeCost(VL);
      InstructionCost ReductionCost =
          getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
      InstructionCost Cost = TreeCost + ReductionCost;
      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for reduction\n");
      if (!Cost.isValid())
        break;
      if (Cost >= -SLPCostThreshold) {
        V.getORE()->emit([&]() {
          return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
                                          ReducedValsToOps.at(VL[0]).front())
                 << "Vectorizing horizontal reduction is possible "
                 << "but not beneficial with cost " << ore::NV("Cost", Cost)
                 << " and threshold "
                 << ore::NV("Threshold", -SLPCostThreshold);
        });
        if (!AdjustReducedVals()) {
          V.analyzedReductionVals(VL);
          unsigned Offset = Pos == Start ? Pos : Pos - 1;
          if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
            // Add subvectors of VL to the list of the analyzed values.
            for (unsigned VF = getFloorFullVectorNumberOfElements(
                     *TTI, VL.front()->getType(), ReduxWidth - 1);
                 VF >= ReductionLimit;
                 VF = getFloorFullVectorNumberOfElements(
                     *TTI, VL.front()->getType(), VF - 1)) {
              if (/* ... */ V.getCanonicalGraphSize() != V.getTreeSize())
                continue;
              for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
                IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
            }
          }
        }
        continue;
      }

      LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                        << Cost << ". (HorRdx)\n");
      V.getORE()->emit([&]() {
        return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
                                  ReducedValsToOps.at(VL[0]).front())
               << "Vectorized horizontal reduction with cost "
               << ore::NV("Cost", Cost) << " and with tree size "
               << ore::NV("TreeSize", V.getTreeSize());
      });
      // Vectorize a tree.
      Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
      Instruction *InsertPt = RdxRootInst;
      if (IsCmpSelMinMax)
        InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
      Value *VectorizedRoot =
          V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
      // Update TrackedToOrig mapping, since the tracked values might be
      // updated.
      for (Value *RdxVal : Candidates) {
        Value *OrigVal = TrackedToOrig.at(RdxVal);
        Value *TransformedRdxVal = TrackedVals.at(OrigVal);
        if (TransformedRdxVal != RdxVal)
          TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
      }
      // ...
      // To prevent poison from leaking across what used to be sequential,
      // safe, scalar boolean logic operations, the reduction operand must be
      // frozen.
      if (AnyBoolLogicOp /* ... */)
        VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

      // Emit code to correctly handle reused reduced values, if required.
      if (OptReusedScalars && !SameScaleFactor) {
        VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                       SameValuesCounter, TrackedToOrig);
      }

      Value *ReducedSubTree;
      Type *ScalarTy = VL.front()->getType();
      if (isa<FixedVectorType>(ScalarTy)) {
        // Revectorization mode: emit one reduction per lane and reassemble.
        // ...
        for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
          // ... (extract lane I from VectorizedRoot and reduce it)
          ReducedSubTree = Builder.CreateInsertElement(
              ReducedSubTree,
              emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
        }
      } else {
        ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
                                       RdxRootInst->getType());
      }
      if (ReducedSubTree->getType() != VL.front()->getType()) {
        assert(ReducedSubTree->getType() != VL.front()->getType() &&
               "Expected different reduction type.");
        ReducedSubTree =
            Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
                                  V.isSignedMinBitwidthRootNode());
      }

      // Improved analysis for add/fadd/xor reductions with same scale factor
      // for all operands of reductions: emit a reduction plus one scale op.
      if (OptReusedScalars && SameScaleFactor)
        ReducedSubTree = emitScaleForReusedOps(
            ReducedSubTree, Builder, SameValuesCounter.front().second);

      VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
      // Count vectorized reduced values to exclude them from final reduction.
      for (Value *RdxVal : VL) {
        Value *OrigV = TrackedToOrig.at(RdxVal);
        if (IsSupportedHorRdxIdentityOp) {
          VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
          continue;
        }
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
        if (!V.isVectorized(RdxVal))
          RequiredExtract.insert(RdxVal);
      }
      Pos += ReduxWidth;
      Start = Pos;
      ReduxWidth = NumReducedVals - Pos;
      if (ReduxWidth > 1)
        ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
      AnyVectorized = true;
    }
    if (OptReusedScalars && !AnyVectorized) {
      for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
        Value *RdxVal = TrackedVals.at(P.first);
        Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
        VectorizedVals.try_emplace(P.first, P.second);
      }
    }
  }
  if (VectorizedTree) {
    // Reorder operands of bool logical op in the natural order to avoid
    // possible problem with poison propagation. If not possible to reorder
    // (both operands are originally RHS), emit an extra freeze instruction
    // for the LHS operand.
    auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
                                                 Instruction *RedOp1,
                                                 Instruction *RedOp2,
                                                 bool InitStep) {
      if (!AnyBoolLogicOp)
        return;
      if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                    getRdxOperand(RedOp1, 0) == LHS ||
                                    isGuaranteedNotToBePoison(LHS, AC)))
        return;
      if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                    getRdxOperand(RedOp2, 0) == RHS ||
                                    isGuaranteedNotToBePoison(RHS, AC))) {
        std::swap(LHS, RHS);
        return;
      }
      if (LHS != VectorizedTree)
        LHS = Builder.CreateFreeze(LHS);
    };
    // Finish the reduction: add the extra arguments and not-vectorized
    // possible reduction values, trying to avoid dependencies between the
    // scalar remainders after reductions.
    auto FinalGen =
        [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
            bool InitStep) {
          unsigned Sz = InstVals.size();
          SmallVector<std::pair<Instruction *, Value *>> ExtraReds(
              Sz / 2 + Sz % 2);
          for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
            // ...
            Value *RdxVal1 = InstVals[I].second;
            Value *StableRdxVal1 = RdxVal1;
            auto It1 = TrackedVals.find(RdxVal1);
            if (It1 != TrackedVals.end())
              StableRdxVal1 = It1->second;
            Value *RdxVal2 = InstVals[I + 1].second;
            Value *StableRdxVal2 = RdxVal2;
            auto It2 = TrackedVals.find(RdxVal2);
            if (It2 != TrackedVals.end())
              StableRdxVal2 = It2->second;
            // To prevent poison from leaking across what used to be
            // sequential, safe, scalar boolean logic operations, the
            // reduction operand must be frozen.
            FixBoolLogicalOps(StableRdxVal1, StableRdxVal2,
                              InstVals[I].first, InstVals[I + 1].first,
                              InitStep);
            Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                       StableRdxVal2, "op.rdx",
                                       ReductionOps);
            ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
          }
          if (Sz % 2 == 1)
            ExtraReds[Sz / 2] = InstVals.back();
          return ExtraReds;
        };
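    // Sketch (plain C++) of one FinalGen pairing round above: adjacent pairs
    // are combined and an odd trailing element is carried forward, so N
    // leftover scalars fold in about log2(N) rounds; addition stands in for
    // createOp.
    //
    //   #include <vector>
    //   static std::vector<long>
    //   pairwiseRoundSketch(const std::vector<long> &Vals) {
    //     unsigned Sz = Vals.size();
    //     std::vector<long> Out(Sz / 2 + Sz % 2);
    //     for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2)
    //       Out[I / 2] = Vals[I] + Vals[I + 1]; // "op.rdx"
    //     if (Sz % 2 == 1)
    //       Out[Sz / 2] = Vals.back(); // odd element survives to next round
    //     return Out;
    //   }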
    SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
    ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
                                 VectorizedTree);
    SmallPtrSet<Value *, 8> Visited;
    for (ArrayRef<Value *> Candidates : ReducedVals) {
      for (Value *RdxVal : Candidates) {
        if (!Visited.insert(RdxVal).second)
          continue;
        unsigned NumOps = VectorizedVals.lookup(RdxVal);
        for (Instruction *RedOp :
             ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
          ExtraReductions.emplace_back(RedOp, RdxVal);
      }
    }
    // Iterate through all not-vectorized reduction values/extra arguments.
    bool InitStep = true;
    while (ExtraReductions.size() > 1) {
      SmallVector<std::pair<Instruction *, Value *>> NewReds =
          FinalGen(ExtraReductions, InitStep);
      ExtraReductions.swap(NewReds);
      InitStep = false;
    }
    VectorizedTree = ExtraReductions.front().second;

    ReductionRoot->replaceAllUsesWith(VectorizedTree);

    // The original scalar reduction is expected to have no remaining uses
    // outside the reduction operations, so mark them for deletion.
#ifndef NDEBUG
    SmallSet<Value *, 4> IgnoreSet;
    for (ArrayRef<Value *> RdxOps : ReductionOps)
      IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
#endif
    for (ArrayRef<Value *> RdxOps : ReductionOps) {
      for (Value *Ignore : RdxOps) {
        if (!Ignore)
          continue;
#ifndef NDEBUG
        for (auto *U : Ignore->users()) {
          assert(IgnoreSet.count(U) &&
                 "All users must be either in the reduction ops list.");
        }
#endif
        if (!Ignore->use_empty()) {
          Value *P = PoisonValue::get(Ignore->getType());
          Ignore->replaceAllUsesWith(P);
        }
      }
      V.removeInstructionsAndOperands(RdxOps);
    }
  } else if (!CheckForReusedReductionOps) {
    // Need to ensure that the reduce values are checked/analyzed again, when
    // the reduction ops were not vectorized.
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps)
        V.analyzedReductionRoot(cast<Instruction>(RdxOp));
  }
  return VectorizedTree;
}
// ... (getReductionCost: estimate the cost difference between the scalar
// reduction chain and the vectorized reduction)
  Type *ScalarTy = ReducedVals.front()->getType();
  unsigned ReduxWidth = ReducedVals.size();
  InstructionCost VectorCost = 0, ScalarCost;
  // ...
  auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
    InstructionCost Cost = 0;
    // Scalar cost is repeated for N-1 elements.
    int Cnt = ReducedVals.size();
    for (Value *RdxVal : ReducedVals) {
      if (Cnt == 1)
        break;
      --Cnt;
      if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
        Cost += GenCostFn();
        continue;
      }
      InstructionCost ScalarCost = 0;
      for (User *U : RdxVal->users()) {
        auto *RdxOp = cast<Instruction>(U);
        if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
          // ... (accumulate the cost of RdxOp)
          continue;
        }
        ScalarCost = InstructionCost::getInvalid();
        break;
      }
      if (ScalarCost.isValid())
        Cost += ScalarCost;
      else
        Cost += GenCostFn();
    }
    return Cost;
  };
  switch (RdxKind) {
  case RecurKind::Add:
  case RecurKind::Mul:
  case RecurKind::Or:
  case RecurKind::And:
  case RecurKind::Xor:
  case RecurKind::FAdd:
  case RecurKind::FMul: {
    // ...
    if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
      // Revectorization: sum the per-lane reduction costs.
      for (unsigned I : seq<unsigned>(ReducedVals.size())) {
        // ...
      }
    } else {
      // ...
      auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
          std::make_pair(RedTy, true));
      if (RType == RedTy) {
        // ... (plain arithmetic reduction cost)
      } else {
        // ... (reduction cost plus the extend/truncate cost)
      }
    }
    ScalarCost = EvaluateScalarCost([&]() {
      // ... (cost of one scalar arithmetic op)
    });
    break;
  }
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin: {
    // ... (vector min/max reduction cost)
    ScalarCost = EvaluateScalarCost([&]() {
      // ... (cost of one scalar min/max intrinsic)
    });
    break;
  }
  default:
    llvm_unreachable("Expected arithmetic or min/max reduction operation");
  }

  LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                    << " for reduction " /* ... */
                    << " (It is a splitting reduction)\n");
  return VectorCost - ScalarCost;
}
// ... (emitReduction: create a single vector reduction for the root)
  assert(VectorizedValue && "Need to have a vectorized tree node");
  assert(RdxKind != RecurKind::FMulAdd &&
         "A call to the llvm.fmuladd intrinsic is not handled yet");

  auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
  if (FTy->getScalarType() == Builder.getInt1Ty() &&
      RdxKind == RecurKind::Add /* ... && the result type is wider */) {
    // An add reduction of an i1 mask is a popcount: bitcast the <n x i1>
    // vector to an n-bit integer and count its set bits.
    Value *V = Builder.CreateBitCast(
        VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
    ++NumVectorInstructions;
    return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
  }
  ++NumVectorInstructions;
  // ... (emit the generic vector reduction for RdxKind)
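  // Scalar analogue (assuming C++20 <bit>) of the i1 special case above: an
  // add-reduction of a <N x i1> mask is a bitcast to an N-bit integer
  // followed by a population count.
  //
  //   #include <bit>
  //   #include <cstdint>
  //   static unsigned addReduceMaskSketch(std::uint32_t MaskBits) {
  //     return std::popcount(MaskBits); // == number of set lanes
  //   }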
// ... (emitScaleForReusedOps: scale a single value reused Cnt times)
  assert(IsSupportedHorRdxIdentityOp &&
         "The optimization of matched scalar identity horizontal reductions "
         "must be supported.");
  if (Cnt == 1)
    return VectorizedValue;
  switch (RdxKind) {
  case RecurKind::Add: {
    // res = mul vv, n
    Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
    LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                      << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateMul(VectorizedValue, Scale);
  }
  case RecurKind::Xor: {
    // res = n % 2 ? vv : 0
    LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                      << ". (HorRdx)\n");
    if (Cnt % 2 == 0)
      return Constant::getNullValue(VectorizedValue->getType());
    return VectorizedValue;
  }
  case RecurKind::FAdd: {
    // res = fmul vv, n
    Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
    LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                      << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateFMul(VectorizedValue, Scale);
  }
  case RecurKind::And:
  case RecurKind::Or:
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin:
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
    // res = vv
    return VectorizedValue;
  case RecurKind::Mul:
  case RecurKind::FMul:
  case RecurKind::FMulAdd:
  case RecurKind::IAnyOf:
  case RecurKind::FAnyOf:
  case RecurKind::IFindLastIV:
  case RecurKind::FFindLastIV:
  case RecurKind::None:
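    // Scalar analogues (plain C++) of the cases above: a value repeated Cnt
    // times folds to Cnt*V under add, to V under and/or/min/max (idempotent
    // ops), and depends only on the parity of Cnt under xor (x ^ x == 0).
    //
    //   static long scaleAddSketch(long V, unsigned Cnt) {
    //     return (long)Cnt * V;
    //   }
    //   static long scaleXorSketch(long V, unsigned Cnt) {
    //     return (Cnt % 2) ? V : 0;
    //   }
    //   static long scaleIdempotentSketch(long V, unsigned) { return V; }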
// ... (emitReusedOps: rebuild the vector to account for repeated scalars)
  assert(IsSupportedHorRdxIdentityOp &&
         "The optimization of matched scalar identity horizontal reductions "
         "must be supported.");
  auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
  if (VTy->getElementType() != VL.front()->getType()) {
    VectorizedValue = Builder.CreateIntCast(
        VectorizedValue,
        getWidenedType(VL.front()->getType(), VTy->getNumElements()),
        R.isSignedMinBitwidthRootNode());
  }
  switch (RdxKind) {
  case RecurKind::Add: {
    // root = mul prev_root, <1, 1, n, 1>
    SmallVector<Constant *> Vals;
    for (Value *V : VL) {
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
      Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
    }
    Value *Scale = ConstantVector::get(Vals);
    LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
                      << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateMul(VectorizedValue, Scale);
  }
  case RecurKind::And:
  case RecurKind::Or:
    // No need for multiple or/and(s).
    LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                      << ". (HorRdx)\n");
    return VectorizedValue;
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin:
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
    // No need for multiple min/max(s) of the same value.
    LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
                      << ". (HorRdx)\n");
    return VectorizedValue;
  case RecurKind::Xor: {
    // Replace values with even number of repeats with 0, since x xor x = 0.
    // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 7>
    SmallVector<int> Mask(
        cast<FixedVectorType>(VectorizedValue->getType())->getNumElements());
    std::iota(Mask.begin(), Mask.end(), 0);
    bool NeedShuffle = false;
    for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
      Value *V = VL[I];
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
      if (Cnt % 2 == 0) {
        Mask[I] = VF;
        NeedShuffle = true;
      }
    }
    LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I : Mask) dbgs() << I << " ";
               dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
    if (NeedShuffle)
      VectorizedValue = Builder.CreateShuffleVector(
          VectorizedValue,
          ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
    return VectorizedValue;
  }
  case RecurKind::FAdd: {
    // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
    SmallVector<Constant *> Vals;
    for (Value *V : VL) {
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
      Vals.push_back(ConstantFP::get(V->getType(), Cnt));
    }
    Value *Scale = ConstantVector::get(Vals);
    return Builder.CreateFMul(VectorizedValue, Scale);
  }
  case RecurKind::Mul:
  case RecurKind::FMul:
  case RecurKind::FMulAdd:
  case RecurKind::IAnyOf:
  case RecurKind::FAnyOf:
  case RecurKind::IFindLastIV:
  case RecurKind::FFindLastIV:
  case RecurKind::None:
    llvm_unreachable("Unexpected reduction kind for reused scalars.");
  }
  return nullptr;
}
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}

static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();

  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      // All struct elements must be of the same type.
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0))
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}
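// Worked example for getAggregateSize: a type like [2 x [3 x <4 x float>]]
// multiplies 2 * 3 through the arrays, reaches the fixed vector, multiplies
// by 4 and returns 24; a struct with mixed element types yields std::nullopt.
// A tiny numeric sketch (assumed encoding: one count per nesting level,
// 0 standing in for a non-homogeneous aggregate):
//
//   #include <optional>
//   #include <vector>
//   static std::optional<unsigned>
//   flatSizeSketch(const std::vector<unsigned> &NestedCounts) {
//     unsigned Size = 1;
//     for (unsigned N : NestedCounts) {
//       if (N == 0)
//         return std::nullopt;
//       Size *= N;
//     }
//     return Size; // e.g. {2, 3, 4} -> 24
//   }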
static void findBuildAggregate_rec(Instruction *LastInsertInst,
                                   TargetTransformInfo *TTI,
                                   SmallVectorImpl<Value *> &BuildVectorOpds,
                                   SmallVectorImpl<Value *> &InsertElts,
                                   unsigned OperandOffset, const BoUpSLP &R) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(1);
    std::optional<unsigned> OperandIndex =
        getElementIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex || R.isDeleted(LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
                             BuildVectorOpds, InsertElts, *OperandIndex, R);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}

static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts,
                               const BoUpSLP &R) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");

  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);

  findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
                         R);
  // ...
  if (BuildVectorOpds.size() >= 2)
    return true;

  return false;
}
static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {
  // ...
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  // ...

  // There is a loop latch, return the incoming value if it comes from that.
  // This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}
/// We could have an initial reduction that is not an add.
///  r *= v1 + v2 + v3 + v4
/// In such a case start looking for a tree rooted in the first '+'.
/// \returns the new root if found, which may be nullptr if not an
/// instruction.
static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
                                                 Instruction *Root) {
  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
          isa<IntrinsicInst>(Root)) &&
         "Expected binop, select, or intrinsic for reduction matching");
  Value *LHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
  Value *RHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  if (LHS == Phi)
    return dyn_cast<Instruction>(RHS);
  if (RHS == Phi)
    return dyn_cast<Instruction>(LHS);
  return nullptr;
}

/// \returns the first operand of \p I that does not match \p Phi, or nullptr
/// if it is not an instruction.
static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
  if (!matchRdxBop(I, Op0, Op1))
    return nullptr;
  return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
}

/// \returns true if \p I is a candidate instruction for reduction
/// vectorization.
static bool isReductionCandidate(Instruction *I) {
  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
  Value *B0 = nullptr, *B1 = nullptr;
  bool IsBinop = matchRdxBop(I, B0, B1);
  return IsBinop || IsSelect;
}
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  if (!ShouldVectorizeHor)
    return false;
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);

  if (Root->getParent() != BB || isa<PHINode>(Root))
    return false;

  // If we can find a secondary reduction root, use that instead.
  auto SelectRoot = [&]() {
    if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
        HorizontalReduction::getRdxKind(Root) != RecurKind::None)
      if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
        return NewRoot;
    return Root;
  };

  // Start analysis starting from Root instruction. If horizontal reduction is
  // found, try to vectorize it. If it is not a horizontal reduction or
  // vectorization is not possible or not effective, and currently analyzed
  // instruction is a binary operation, try to vectorize the operands, using
  // pre-order DFS traversal order.
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  auto TryToReduce = [this, &R](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    if (!isReductionCandidate(Inst))
      return nullptr;
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      FutureSeed = getNonPhiOperand(Root, P);
      if (!FutureSeed)
        return false;
    }
    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
    // analysis is done separately.
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
      PostponedInsts.push_back(FutureSeed);
    return true;
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // Do not try to analyze an instruction that has already been vectorized.
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        // Try to find another reduction.
        Stack.emplace(I, Level);
        continue;
      }
      if (R.isDeleted(Inst))
        continue;
    } else {
      // We could not vectorize `Inst`, so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst))
        break;
    }

    // Try to vectorize operands. Continue analysis for instructions from the
    // same basic block only to save compile time.
    if (++Level < RecursionMaxDepth)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Op).second)
          if (auto *I = dyn_cast<Instruction>(Op))
            // Do not try to vectorize CmpInst operands, this is done
            // separately.
            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(
                    I) &&
                !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(I, Level);
  }
  return Res;
}
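// Sketch (plain C++, hypothetical node type) of the worklist traversal in
// vectorizeHorReduction: a FIFO of (node, depth) pairs is seeded with the
// root, and operands are enqueued one level deeper until the depth cap.
//
//   #include <cstddef>
//   #include <queue>
//   #include <utility>
//   #include <vector>
//   struct NodeSketch { std::vector<NodeSketch *> Operands; };
//   static std::size_t countReachableSketch(NodeSketch *Root,
//                                           unsigned MaxDepth) {
//     std::size_t Seen = 0;
//     std::queue<std::pair<NodeSketch *, unsigned>> Stack;
//     Stack.emplace(Root, 0u);
//     while (!Stack.empty()) {
//       auto [N, Level] = Stack.front();
//       Stack.pop();
//       ++Seen; // stands in for TryToReduce(N)
//       if (Level + 1 < MaxDepth)
//         for (NodeSketch *Op : N->Operands)
//           Stack.emplace(Op, Level + 1);
//     }
//     return Seen;
//   }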
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in vector register.
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
      (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(T *, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(*SameTypeIt, *IncIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // ...
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success: start over because instructions might have been changed.
      Changed = true;
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL)
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
    } else {
      // If we did not vectorize, collect the short runs as candidates for a
      // final, combined attempt.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL)
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success: start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end();
             It != End; VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
                                                  /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
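// Sketch (plain C++) of the run-grouping strategy in tryToVectorizeSequence:
// after sorting, maximal runs of mutually compatible elements are handed to
// the vectorization callback; singleton runs are skipped.
//
//   #include <cstddef>
//   #include <functional>
//   #include <vector>
//   template <typename T>
//   static bool forEachRunSketch(
//       std::vector<T> &Sorted,
//       std::function<bool(const T &, const T &)> Compatible,
//       std::function<bool(const std::vector<T> &)> TryVec) {
//     bool Changed = false;
//     for (std::size_t I = 0; I < Sorted.size();) {
//       std::size_t J = I + 1;
//       while (J < Sorted.size() && Compatible(Sorted[I], Sorted[J]))
//         ++J;
//       if (J - I > 1)
//         Changed |= TryVec(
//             std::vector<T>(Sorted.begin() + I, Sorted.begin() + J));
//       I = J; // continue with the next run of a different kind
//     }
//     return Changed;
//   }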
/// Compare two cmp instructions. If IsCompatibility is true, returns true if
/// the two cmps are compatible (same predicate after canonicalization,
/// compatible operands and parents); otherwise returns true if V is "smaller"
/// than V2 in a strict weak ordering.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  // Compare the canonicalized (swap-invariant) predicates.
  // ...
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
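// Minimal sketch of the IsCompatibility pattern used by compareCmp: one
// template serves both as a strict weak order (IsCompatibility == false) and
// as an equivalence test (IsCompatibility == true) over a simple key.
//
//   template <bool IsCompatibility>
//   static bool compareKeysSketch(int K1, int K2) {
//     if (K1 < K2)
//       return !IsCompatibility; // sorter: "less"; compat test: "not equal"
//     if (K1 > K2)
//       return false;
//     return IsCompatibility;    // equal: compatible, but not strictly less
//   }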
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
        if (R.isDeleted(I))
          break;
      }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize the list of compares. Sort by type, compare predicate,
  // etc.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, *TLI, *DT);
  };

  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    return compareCmp<true>(V1, V2, *TLI, *DT);
  };

  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // pass1 - try to match and vectorize a buildvector sequence for MaxVF
    // only.
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/true);
    }
    // pass2 - try to vectorize reductions only.
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    // pass3 - try to match and vectorize a buildvector sequence.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R,
                                             /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);
  Instructions.clear();
  return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers, we don't care about other types.
    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
      return true;
    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
      return false;
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S && !S.isAltShuffle())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
        if (C1 && C2)
          continue;
        if (C1)
          return true;
        if (C2)
          return false;
      }
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      {
        // Non-constant non-instructions come next.
        if (!U1 && !U2) {
          auto ValID1 = Opcodes1[I]->getValueID();
          auto ValID2 = Opcodes2[I]->getValueID();
          if (ValID1 == ValID2)
            continue;
          if (ValID1 < ValID2)
            return true;
          if (ValID1 > ValID2)
            return false;
        }
        if (!U1)
          return true;
        if (!U2)
          return false;
      }
      // Undefs come last.
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          if (getSameOpcode({I1, I2}, *TLI))
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P)
        break;

      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    // Find the corresponding non-phi nodes for better matching when trying to
    // build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    // ...
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();
  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in PostProcessInserts and, if VectorizeCmps is true,
  // also vectorizes PostProcessCmps.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if I is in PostProcessInserts or PostProcessCmps.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if the instruction has no users; such instructions are only
  // interesting as vectorization roots.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable vector type.
    if (isa<ScalableVectorType>(It->getType()))
      continue;

    // Skip instructions marked for the deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    if (isa<DbgInfoIntrinsic>(It))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res = vectorizeRootInstruction(nullptr, PI,
                                              P->getIncomingBlock(I), R);
          Changed |= Res;
          if (Res && R.isDeleted(P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize chain in store, if this is the only store to the
        // address in the block.
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay
          // their vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // Start vectorization of post-process list of instructions from the
      // top-tree instructions to try to vectorize as many instructions as
      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. If a vector register can't hold 1 element, we are done.
    auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs, preserving program
      // order.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them, or their index was optimized to a constant.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1;
           ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and values operand. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
        DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        return !!getSameOpcode({I1, I2}, *TLI);
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  // ...
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse stores to do bottom-to-top analysis. This is important if the
    // values are stored to the same addresses several times, in which case
    // the order of the stores must be followed top-to-bottom.
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
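// Sketch (plain C++, assumed key fields) of the store ordering above: sort
// by value-type id, then pointer-type id, then scalar width, so compatible
// stores become adjacent before chains are formed.
//
//   #include <tuple>
//   struct StoreKeySketch {
//     unsigned ValueTypeID, PointerTypeID, ScalarBits;
//   };
//   static bool storeLessSketch(const StoreKeySketch &A,
//                               const StoreKeySketch &B) {
//     return std::tie(A.ValueTypeID, A.PointerTypeID, A.ScalarBits) <
//            std::tie(B.ValueTypeID, B.PointerTypeID, B.ScalarBits);
//   }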
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Correctly creates insert_subvector, checking that the index is multiple of the subvectors length.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Correctly creates extract_subvector, checking that the index is multiple of the subvectors length.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
void clearAllBits()
Set every bit to 0.
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
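Together, these APInt helpers cover the bit-level bookkeeping used throughout the pass (demanded-lane masks and the like). A minimal standalone sketch of how they compose; the widths and values are hypothetical:

#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

void apintDemo() {
  APInt Demanded = APInt::getAllOnes(8);        // 0b11111111
  Demanded.clearBit(3);                         // lane 3 no longer demanded
  assert(!Demanded.isAllOnes() && !Demanded.isZero());
  APInt HighHalf = APInt::getBitsSetFrom(8, 4); // 0b11110000
  APInt LaneZero = APInt::getOneBitSet(8, 0);   // 0b00000001
  assert(Demanded.intersects(HighHalf) && Demanded.intersects(LaneZero));
  assert(APInt(8, 10).urem(APInt(8, 4)) == 2);  // unsigned remainder: 10 % 4
}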
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
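These ArrayRef accessors are how the vectorizer walks operand lists without copying. Their semantics in one standalone sketch over hypothetical data:

#include "llvm/ADT/ArrayRef.h"
#include <cassert>
using namespace llvm;

void arrayRefDemo() {
  int Data[] = {0, 1, 2, 3, 4, 5};
  ArrayRef<int> A(Data);
  assert(A.front() == 0 && A.back() == 5 && A.size() == 6);
  assert(A.drop_front(2).front() == 2);    // view of {2,3,4,5}
  assert(A.take_front(3).back() == 2);     // view of {0,1,2}
  assert(A.slice(1, 3).equals({1, 2, 3})); // skip 1 element, keep 3
  assert(!A.drop_back().empty());          // view of {0,1,2,3,4}
}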
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
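The distinction between the swapped and the inverse predicate matters when the vectorizer commutes compare operands versus negating a condition: swapping exchanges the operand order, inverting negates the relation. A small sketch (these are static helpers, so no IR is needed):

#include "llvm/IR/Instructions.h"
#include <cassert>
using namespace llvm;

void predicateDemo() {
  // a >s b  is the same as  b <s a  (operands exchanged).
  assert(CmpInst::getSwappedPredicate(CmpInst::ICMP_SGT) == CmpInst::ICMP_SLT);
  // !(a >s b)  is the same as  a <=s b  (relation negated).
  assert(CmpInst::getInversePredicate(CmpInst::ICMP_SGT) == CmpInst::ICMP_SLE);
}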
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
static bool shouldExecute(unsigned CounterName)
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
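A quick standalone illustration of the DenseMap entry points listed above; note that lookup returns a default-constructed value for missing keys, while at aborts:

#include "llvm/ADT/DenseMap.h"
#include <cassert>
using namespace llvm;

void denseMapDemo() {
  DenseMap<int, unsigned> NumUses;
  NumUses.try_emplace(7, 1);                 // inserts {7, 1}
  auto [It, Inserted] = NumUses.try_emplace(7, 99);
  assert(!Inserted && It->second == 1);      // key existed; value kept
  assert(NumUses.lookup(42) == 0);           // missing key -> default value
  assert(NumUses.contains(7) && !NumUses.contains(42));
  NumUses.erase(7);
  assert(NumUses.find(7) == NumUses.end());
}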
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
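As a hedged sketch of how these builder calls compose (the block and pointer are assumed to exist in the caller's module; this is not code from the pass): load a <4 x float> and reverse its lanes with a single-source shuffle.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

Value *emitReversedLoad(BasicBlock *BB, Value *Ptr) {
  IRBuilder<> Builder(BB->getContext());
  Builder.SetInsertPoint(BB); // append new instructions to the end of BB
  auto *VecTy = FixedVectorType::get(Builder.getFloatTy(), 4);
  Value *V = Builder.CreateAlignedLoad(VecTy, Ptr, MaybeAlign(16), "vec");
  // Single-source shuffle: lane i of the result is lane Mask[i] of V.
  return Builder.CreateShuffleVector(V, ArrayRef<int>{3, 2, 1, 0}, "rev");
}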
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
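MapVector is the container of choice when hashing must not leak into output order: lookups are hash-based, but iteration follows insertion order, which keeps generated IR deterministic. A standalone sketch:

#include "llvm/ADT/MapVector.h"
#include <cassert>
using namespace llvm;

void mapVectorDemo() {
  MapVector<int, int> M;
  M.insert({2, 20});
  M.insert({1, 10});
  M.try_emplace(2, 99);               // key exists; value stays 20
  assert(M.front().first == 2);       // iteration order == insertion order
  assert(M.lookup(2) == 20 && M.lookup(5) == 0);
  auto Pairs = M.takeVector();        // drain the underlying vector
  assert(M.empty() && Pairs.size() == 2);
}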
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
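The pattern behind the pass's pointer-distance queries is SCEV subtraction: take the SCEVs of two pointers and ask whether their difference folds to a constant. A hedged standalone sketch under that assumption (the helper name is hypothetical):

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
using namespace llvm;

// Returns true and sets Dist if PtrB - PtrA folds to a compile-time constant
// according to SE.
bool constantPointerDistance(ScalarEvolution &SE, Value *PtrA, Value *PtrB,
                             int64_t &Dist) {
  const SCEV *Diff = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  if (const auto *C = dyn_cast<SCEVConstant>(Diff)) {
    Dist = C->getAPInt().getSExtValue();
    return true;
  }
  return false;
}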
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
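SetVector combines both behaviours a vectorizer worklist needs: O(1) deduplication on insert and deterministic, insertion-ordered iteration. For example:

#include "llvm/ADT/SetVector.h"
#include <cassert>
using namespace llvm;

void setVectorDemo() {
  SetVector<int> Worklist;
  assert(Worklist.insert(3));    // newly inserted
  assert(Worklist.insert(1));
  assert(!Worklist.insert(3));   // duplicate rejected
  assert(Worklist.front() == 3); // order of first insertion is kept
  assert(Worklist.contains(1) && Worklist.size() == 2);
  Worklist.clear();
  assert(Worklist.empty());
}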
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
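These static mask classifiers are pure functions of the integer mask, so they can be exercised without building any IR. A few representative cases:

#include "llvm/IR/Instructions.h"
#include <cassert>
using namespace llvm;

void shuffleMaskDemo() {
  int Index = -1;
  // <2,3> from a 4-wide source extracts the subvector starting at lane 2.
  assert(ShuffleVectorInst::isExtractSubvectorMask({2, 3}, /*NumSrcElts=*/4,
                                                   Index) &&
         Index == 2);
  // <3,2,1,0> reverses one 4-wide source.
  assert(ShuffleVectorInst::isReverseMask({3, 2, 1, 0}, /*NumSrcElts=*/4));
  // <0,0,0,0> broadcasts lane 0.
  assert(ShuffleVectorInst::isZeroEltSplatMask({0, 0, 0, 0}, /*NumSrcElts=*/4));
  // <0,2,4,6> de-interleaves with factor 2 starting at lane 0.
  unsigned Start = 0;
  assert(ShuffleVectorInst::isDeInterleaveMaskOfFactor({0, 2, 4, 6}, 2, Start));
}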
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
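SmallBitVector's query interface in one standalone example (it stores the bits inline while they fit in a pointer and heap-allocates only beyond that):

#include "llvm/ADT/SmallBitVector.h"
#include <cassert>
using namespace llvm;

void bitVectorDemo() {
  SmallBitVector Seen(8); // 8 bits, initially clear
  assert(Seen.none() && !Seen.any());
  Seen.set(1);
  Seen.set(5);
  assert(Seen.test(1) && !Seen.test(0));
  assert(Seen.find_first() == 1 && Seen.find_next(1) == 5);
  assert(Seen.find_next(5) == -1); // no further set bits
  assert(Seen.count() == 2 && !Seen.all());
}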
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
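The Type queries above compose naturally; a standalone sketch over a hypothetical <4 x i32>:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>
using namespace llvm;

void typeDemo() {
  LLVMContext Ctx;
  auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  assert(VecTy->isVectorTy() && VecTy->isIntOrIntVectorTy());
  assert(VecTy->getScalarType()->isIntegerTy() &&
         VecTy->getScalarSizeInBits() == 32);
  // Same element count, new element type: <4 x float>.
  Type *FloatVecTy = VecTy->getWithNewType(Type::getFloatTy(Ctx));
  assert(FloatVecTy->isFPOrFPVectorTy());
}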
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
void takeName(Value *V)
Transfer the name from V to this value.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedness.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
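Read together, the BoUpSLP entry points above suggest the usual driver sequence: build the tree, bail out on tiny graphs, reorder, compute minimal bitwidths, cost the tree, then emit. A hedged sketch of that flow using only the signatures listed here (setup, thresholding, and error handling elided; the ordering is an assumption, not the pass's actual driver):

#include "llvm/ADT/DenseSet.h"
#include "llvm/Support/InstructionCost.h"
using namespace llvm;

InstructionCost tryVectorizeChain(BoUpSLP &R, ArrayRef<Value *> Chain) {
  SmallDenseSet<Value *> Ignore; // no externally ignored users in this sketch
  R.buildTree(Chain, Ignore);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return InstructionCost::getInvalid();
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.transformNodes();
  R.buildExternalUses();
  R.computeMinimumValueSizes();
  InstructionCost Cost = R.getTreeCost();
  if (Cost.isValid() && Cost < 0) // negative cost delta means profitable
    R.vectorizeTree();
  return Cost;
}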
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
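These m_* combinators nest to describe whole expression trees; match fills the binders on success and leaves them untouched on failure. A standalone sketch recognizing (X << C) | Y with a single-use shift (the function name is hypothetical):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

bool matchShiftOr(Value *V, Value *&X, Value *&Y, const APInt *&C) {
  // m_OneUse guards against duplicating a shift that has other users.
  return match(V, m_Or(m_OneUse(m_Shl(m_Value(X), m_APInt(C))), m_Value(Y)));
}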
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
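getPointersDiff reports the distance in elements of the given element type, which reduces a consecutive-access check to a comparison against 1. A hedged sketch under that reading (the helper name is hypothetical):

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/Instructions.h"
#include <optional>
using namespace llvm;

// True if B reads the element immediately after A (distance of one element).
bool areConsecutiveLoads(LoadInst *A, LoadInst *B, const DataLayout &DL,
                         ScalarEvolution &SE) {
  if (A->getType() != B->getType())
    return false;
  std::optional<int> Diff =
      getPointersDiff(A->getType(), A->getPointerOperand(), B->getType(),
                      B->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1;
}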
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
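The range helpers scattered through this list replace iterator-pair boilerplate: zip walks parallel ranges in lockstep, and enumerate pairs each element with its index. A standalone sketch:

#include "llvm/ADT/STLExtras.h"
#include <cassert>
using namespace llvm;

void rangeDemo() {
  int A[] = {1, 2, 3};
  int B[] = {10, 20, 30};
  assert(all_of(A, [](int X) { return X > 0; }));
  assert(none_of(A, [](int X) { return X > 3; }));
  for (auto &&[X, Y] : zip(A, B))
    assert(Y == 10 * X);
  for (const auto &En : enumerate(B))
    assert(En.value() == 10 * (static_cast<int>(En.index()) + 1));
}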
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
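The power-of-two helpers above (bit_ceil, PowerOf2Ceil, has_single_bit, Log2_32) do the vectorization-factor rounding seen throughout the pass. Their behaviour at a glance:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;

void bitMathDemo() {
  assert(bit_ceil(5u) == 8u);                   // round up to a power of two
  assert(PowerOf2Ceil(5) == 8);                 // uint64_t variant
  assert(has_single_bit(8u) && !has_single_bit(12u));
  assert(Log2_32(32) == 5 && Log2_32(33) == 5); // floor log2
}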
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
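Both mask factories have simple closed forms, and inversePermutation (a static helper in this file) turns an order vector into the shuffle mask that realizes it. For the factories:

#include "llvm/Analysis/VectorUtils.h"
#include <cassert>
using namespace llvm;

void maskFactoryDemo() {
  // Stride mask: lane i selects Start + i*Stride -> <0,2,4,6>.
  SmallVector<int, 16> Strided =
      createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  assert(Strided[0] == 0 && Strided[1] == 2 && Strided[3] == 6);
  // Replicated mask: each of VF lanes repeated -> <0,0,1,1,2,2>.
  SmallVector<int, 16> Rep =
      createReplicatedMask(/*ReplicationFactor=*/2, /*VF=*/3);
  assert(Rep.size() == 6 && Rep[0] == 0 && Rep[2] == 1 && Rep[5] == 2);
}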
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if the widened type of Ty elements with size Sz represents a full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.