#ifdef EXPENSIVE_CHECKS
// ...
#endif

using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
              "Controls which SLP graphs should be vectorized.");

static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

static cl::opt<bool>
    SLPReVec("slp-revec", cl::init(false), cl::Hidden,
             cl::desc("Enable vectorization for wider vector utilization"));

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
    "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

static cl::opt<bool>
    ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                       cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<int> MaxVectorRegSizeOption(
    "slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
    MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
                cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

static cl::opt<int> ScheduleRegionSizeBudget(
    "slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when probing among candidates for vectorization tree roots.
static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<unsigned> MinProfitableStridedLoads(
    "slp-min-strided-loads", cl::init(2), cl::Hidden,
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

static cl::opt<unsigned> MaxProfitableLoadStride(
    "slp-max-stride", cl::init(8), cl::Hidden,
    cl::desc("The maximum stride, considered to be profitable."));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
static bool isValidElementType(Type *Ty) {
  // TODO: Support ScalableVectorType.
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP80Ty();
}
/// \returns the type of the given value \p V: the stored value type for
/// stores, the operand type for compares, the inserted scalar type for
/// insertelement; otherwise the type of \p V itself.
static Type *getValueType(Value *V) {
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();
  return V->getType();
}
/// \returns the number of elements of \p Ty, or 1 if it is not a vector type.
static unsigned getNumElements(Type *Ty) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
  return 1;
}
/// Returns the number of elements of the given type \p Ty, not less than \p
/// Sz, which forms a type that splits into whole vector registers during
/// legalization.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
                                              Type *Ty, unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_ceil(Sz);
  // Find the number of elements, which forms full vectors.
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_ceil(Sz);
  return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
}

/// Returns the number of elements of the given type \p Ty, not greater than
/// \p Sz, which forms a type that splits into whole vector registers during
/// legalization.
static unsigned
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
                                   unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_floor(Sz);
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_floor(Sz);
  unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
  if (RegVF > Sz)
    return bit_floor(Sz);
  return (Sz / RegVF) * RegVF;
}
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                   SmallVectorImpl<int> &Mask) {
  // The ShuffleBuilder implementation uses shufflevector to splat an
  // "element". But the element has a different meaning for SLP (scalar) and
  // REVEC (vector). We need to expand Mask into masks which shufflevector can
  // use directly.
  SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                        : Mask[I] * VecTyNumElements + J;
  Mask.swap(NewMask);
}
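// Illustrative sketch (not part of the original source): with
// VecTyNumElements = 2, the scalar mask {1, 0} expands to {2, 3, 0, 1} -
// lane I of the scalar mask becomes the run [Mask[I]*2, Mask[I]*2+1] in the
// vector mask, which shufflevector can consume directly.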
/// \returns the number of groups of shufflevector instructions in \p VL. A
/// group consists of shufflevectors that extract disjoint subvectors covering
/// all elements of the same source.
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
  if (VL.empty())
    return 0;
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
    ArrayRef<Value *> Group = VL.slice(I, GroupSize);
    SmallBitVector ExpectedIndex(GroupSize);
    if (!all_of(Group, [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(V);
          // From the same source.
          if (SV->getOperand(0) != Src)
            return false;
          int Index;
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);
          return true;
        }))
      return 0;
    if (!ExpectedIndex.all())
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  return NumGroup;
}
/// \returns a shufflevector mask which is used to vectorize a group of
/// shufflevectors into a single wide shuffle.
static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
  assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  SmallVector<int> Mask;
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
                                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
  return Mask;
}
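// Illustrative sketch (not part of the original source): for two
// shufflevectors over <4 x i32> sources, each with mask {0, 1}, the combined
// mask is {0, 1, 4, 5} - the second source's lanes are offset by
// AccumulateLength (4), so one wide shuffle can replace the whole group.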
/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}

/// Checks if \p V is one of vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}
/// \returns the number of elements in the \p Part of a vector of total size
/// \p Size split into parts of \p PartNumElems elements (the last part may be
/// shorter).
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
}

#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug
/// output.
static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
  std::string Result;
  raw_string_ostream OS(Result);
  if (Idx >= 0)
    OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  return Result;
}
#endif
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return false;
  BasicBlock *BB = cast<Instruction>(*It)->getParent();
  for (Value *V : iterator_range(It, VL.end())) {
    if (isa<PoisonValue>(V))
      continue;
    auto *II = dyn_cast<Instruction>(V);
    if (!II)
      return false;
    if (BB != II->getParent())
      return false;
  }
  return true;
}
/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}
/// \returns True if \p I is commutative; handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(
                BO->uses(),
                [](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  CmpPredicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(BO->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}
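// Illustrative note (not part of the original source): `sub` is not
// commutative in general, but when its only users are `icmp eq/ne (sub x, y),
// 0` the operands may be swapped, since x - y == 0 exactly when y - x == 0.
// The same holds for `abs(sub x, y)` without signed wrap and for
// `fabs(fsub x, y)`, which is why those user patterns are accepted above.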
template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                     unsigned Offset) {
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  int Index = Offset;
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }
  return std::nullopt;
}

/// \returns inserting or extracting index of InsertElement, ExtractElement or
/// InsertValue instruction, using \p Offset as the base offset for the index.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;

  int Index = Offset;
  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  if (!IV)
    return std::nullopt;

  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
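// Illustrative sketch (not part of the original source): for
//   insertvalue [2 x [2 x i32]] %agg, i32 %v, 1, 0
// the loop linearizes the multi-index as Index = (0*2 + 1)*2 + 0 = 2, i.e.
// the scalar lands in flattened lane 2 of the aggregate.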
/// Prepares a use bitset for the given mask, either for the first argument or
/// for the second.
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}
/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getElementIndex(II);
        if (!Idx) {
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}
/// Checks if the vector of instructions can be represented as a shuffle of
/// one or two source vectors. Returns the shuffle kind, or std::nullopt.
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
                     AssumptionCache *AC) {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  unsigned Size =
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        if (!EI)
          return S;
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        if (!VTy)
          return S;
        return std::max(S, VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  // ...
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (isa<UndefValue>(Vec)) {
      Mask[I] = I;
    } else {
      if (isa<UndefValue>(EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
      if (!Idx)
        return std::nullopt;
      // Undefined behavior if Idx is negative or >= Size.
      if (Idx->getValue().uge(Size))
        continue;
      unsigned IntIdx = Idx->getValue().getZExtValue();
      Mask[I] = IntIdx;
    }
    // For correct shuffling we have to have at most 2 different vector
    // operands.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector;
  // otherwise we have a permutation of two vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}
/// \returns the extracting index of the extractelement or extractvalue
/// instruction \p E, or std::nullopt if the index is not a single constant.
static std::optional<unsigned> getExtractIndex(const Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}
namespace {
/// Main data required for vectorization of instructions.
class InstructionsState {
  /// The main/alternate instruction. MainOp is also VL0.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

public:
  Instruction *getMainOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return MainOp;
  }

  Instruction *getAltOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return AltOp;
  }

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp)
      : MainOp(MainOp), AltOp(AltOp) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
};
} // end anonymous namespace
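// Illustrative note (not part of the original source): an InstructionsState
// with MainOp == AltOp describes a uniform bundle (e.g. four `add`s), while
// MainOp != AltOp describes an alternating bundle such as {add, sub, add,
// sub}, which is emitted as one vector add, one vector sub, and a blending
// shufflevector.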
/// \returns true if the provided operand pairs are compatible for forming a
/// vectorizable compare bundle.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (isConstant(BaseOp0) && isConstant(Op0)) ||
         (isConstant(BaseOp1) && isConstant(Op1)) ||
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI) ||
         getSameOpcode({BaseOp1, Op1}, TLI);
}

/// \returns true if a compare instruction \p CI has similar "look" and same
/// predicate as \p BaseCI, "as is" or with its operands and predicate swapped.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);

  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);

  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}

/// \returns analysis of the instructions in \p VL described in
/// InstructionsState: the opcode(s) under which the whole list could be
/// vectorized even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI) {
  // Make sure these are all Instructions.
  if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
    return InstructionsState::invalid();

  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return InstructionsState::invalid();

  Value *V = *It;
  unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
  if ((VL.size() > 2 && !isa<PHINode>(V) && InstCnt < VL.size() / 2) ||
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();

  bool IsCastOp = isa<CastInst>(V);
  bool IsBinOp = isa<BinaryOperator>(V);
  bool IsCmpOp = isa<CmpInst>(V);
  CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(V)->getPredicate()
                                        : CmpInst::BAD_ICMP_PREDICATE;
  unsigned Opcode = cast<Instruction>(V)->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = std::distance(VL.begin(), It);

  bool SwappedPredsCompatible = [&]() {
    if (!IsCmpOp)
      return false;
    SmallDenseSet<unsigned, 4> UniquePreds, UniqueNonSwappedPreds;
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      if (!I)
        return false;
      CmpInst::Predicate CurrentPred = I->getPredicate();
      CmpInst::Predicate SwappedCurrentPred =
          CmpInst::getSwappedPredicate(CurrentPred);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    // Total number of predicates > 2, but if we consider swapped predicates
    // compatible, only 2 remain: treat swappable predicates as compatible.
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();

  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  auto *IBase = cast<Instruction>(V);
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState::invalid();
  }
  bool AnyPoison = InstCnt != VL.size();
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = dyn_cast<Instruction>(VL[Cnt]);
    if (!I)
      continue;

    // Cannot combine poison and divisions.
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltIndex = Cnt;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltIndex = Cnt;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(V);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // Check for compatible operands. If not compatible, we may need
        // alternate vectorization.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if ((E == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltIndex = Cnt;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
             "CastInst.");
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState::invalid();
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState::invalid();
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            (!CallBase->hasOperandBundles() ||
             !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                         Call->op_begin() + Call->getBundleOperandsEndIndex(),
                         CallBase->op_begin() +
                             CallBase->getBundleOperandsStartIndex())))
          return InstructionsState::invalid();
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState::invalid();
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState::invalid();
        }
      }
      continue;
    }
    return InstructionsState::invalid();
  }

  return InstructionsState(cast<Instruction>(V),
                           cast<Instruction>(VL[AltIndex]));
}
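// Illustrative sketch (not part of the original source): for the IR bundle
//   %a0 = add i32 %x0, %y0
//   %s1 = sub i32 %x1, %y1
// getSameOpcode({%a0, %s1}, TLI) returns a valid InstructionsState with
// MainOp = add and AltOp = sub, because Add/Sub is a safe alternation;
// mixing, say, add with udiv would return InstructionsState::invalid().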
/// \returns true if the \p Scalar is used by \p UserInst as a scalar operand
/// (pointer operand of a load/store, or a scalar argument of an intrinsic),
/// so it must be extracted even if the user itself is vectorized.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                        TargetLibraryInfo *TLI,
                                        const TargetTransformInfo *TTI) {
  if (!UserInst)
    return false;
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    return any_of(enumerate(CI->args()), [&](auto &&Arg) {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;
    });
  }
  default:
    return false;
  }
}
/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}
/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // Check if input scalars were extended to match the size of other node.
       (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
      "SubMask with many inputs support must be larger than the mask.");
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}
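// Illustrative sketch (not part of the original source): addMask composes
// permutations. With Mask = {2, 3, 0, 1} and SubMask = {1, 0, 3, 2}, the
// result is NewMask[I] = Mask[SubMask[I]] = {3, 2, 1, 0}, i.e. "apply Mask,
// then SubMask" folded into a single shuffle mask.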
/// Order may have elements assigned a special value (size) which is out of
/// bounds. Such indices only appear in places that correspond to undef values
/// and are used to keep undef values from affecting operand ordering. The
/// first loop finds all unused indices; the second assigns them to the undef
/// positions. Example, with two masked positions that receive 3 and 7:
/// before:  6 9 5 4 9 2 1 0
/// after:   6 3 5 4 7 2 1 0
static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
  const unsigned Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
}
/// \returns a bitset for selecting opcodes: false for Opcode0 and true for
/// Opcode1.
static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
                                      unsigned Opcode1) {
  Type *ScalarTy = VL[0]->getType();
  unsigned ScalarTyNumElements = getNumElements(ScalarTy);
  SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
  for (unsigned Lane : seq<unsigned>(VL.size())) {
    if (isa<PoisonValue>(VL[Lane]))
      continue;
    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
  }
  return OpcodeMask;
}
namespace llvm {

static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E, PoisonMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}
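// Illustrative sketch (not part of the original source): inverting the order
// {2, 0, 1} yields the mask {1, 2, 0}: Indices[0] == 2 means lane 2 of the
// mask gets value 0, and so on; applying the mask undoes the permutation
// described by Indices.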
/// Reorders the list of scalars in accordance with the given \p Mask.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
                           ArrayRef<int> Mask) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  SmallVector<Value *> Prev(Scalars.size(),
                            PoisonValue::get(Scalars.front()->getType()));
  Prev.swap(Scalars);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Scalars[Mask[I]] = Prev[I];
}
/// Checks if the provided value does not require scheduling: it is not an
/// instruction, or its operands are all non-instructions, phis, or come from
/// a different block.
static bool areAllOperandsNonInsts(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  return !mayHaveNonDefUseDependency(*I) &&
         all_of(I->operands(), [I](Value *V) {
           auto *IO = dyn_cast<Instruction>(V);
           if (!IO)
             return true;
           return isa<PHINode>(IO) || IO->getParent() != I->getParent();
         });
}

/// Checks if the provided value does not require scheduling: it is not an
/// instruction, or all of its users are phis or live in different blocks.
static bool isUsedOutsideBlock(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  // Limits the number of uses to save compile time.
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         all_of(I->users(), [I](User *U) {
           auto *IU = dyn_cast<Instruction>(U);
           if (!IU)
             return true;
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
         });
}

/// Checks if the specified array of instructions does not require scheduling.
static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
  return !VL.empty() &&
         (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
}

/// Returns true if widened type of \p Ty elements with size \p Sz represents
/// a full vector type, i.e. adding an extra element would produce extra parts
/// upon type legalization.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
                                     unsigned Sz) {
  if (!isValidElementType(Ty))
    return false;
  if (has_single_bit(Sz))
    return true;
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
         Sz % NumParts == 0;
}
namespace slpvectorizer {

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
  struct TreeEntry;
  struct ScheduleData;
  // ...

public:
  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
        Builder(Se->getContext(), TargetFolder(*DL)) {
    // ...
  }
  /// Returns whether the root node has in-tree uses.
  bool doesRootHaveInTreeUses() const {
    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();
  }

  /// Return the scalars of the root node.
  ArrayRef<Value *> getRootNodeScalars() const {
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
  }

  /// Checks if the root graph node can be emitted with narrower bitwidth at
  /// codegen and returns it with its signedness, if so.
  std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
    const TreeEntry &Root = *VectorizableTree.front().get();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())
      return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
                                             It->second.first),
                            It->second.second);
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;
  }

  /// Checks if the root node can be emitted with narrower bitwidth and
  /// returns its signedness, if so.
  bool isSignedMinBitwidthRootNode() const {
    return MinBWs.at(VectorizableTree.front().get()).second;
  }

  /// Returns reduction type after minbitwidth analysis.
  FixedVectorType *getReductionType() const {
    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      return getWidenedType(
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());
    return getWidenedType(
        IntegerType::get(
            VectorizableTree.front()->Scalars.front()->getContext(),
            ReductionBitWidth),
        VectorizableTree.front()->getVectorFactor());
  }
  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MultiNodeScalars.clear();
    MustGather.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
    ReductionBitWidth = 0;
    BaseGraphSize = 1;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
  }
  /// Checks if the given order is an identity order, allowing the special
  /// out-of-bounds marker value Sz at any position.
  static bool isIdentityOrder(ArrayRef<unsigned> Order) {
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
    return all_of(enumerate(Order), [&](const auto &P) {
      return P.value() == P.index() || P.value() == Sz;
    });
  }
  unsigned getMaxVecRegSize() const {
    return MaxVecRegSize;
  }

  unsigned getMinVecRegSize() const {
    return MinVecRegSize;
  }

  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
        MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
  }

  // ...

  /// Checks if the given array of loads can be represented as a vectorized,
  /// scatter or just simple gather.
  LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                               SmallVectorImpl<unsigned> &Order,
                               SmallVectorImpl<Value *> &PointerOps,
                               unsigned *BestVF = nullptr,
                               bool TryRecursiveCheck = true) const;
  // ...
  template <typename T>
  // ...

  /// This structure holds any data we need about the edges being traversed
  /// during buildTree_rec(). We keep track of:
  /// (i) the user TreeEntry index, and (ii) the index of the edge.
  struct EdgeInfo {
    EdgeInfo() = default;
    EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
        : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
    /// The user TreeEntry.
    TreeEntry *UserTE = nullptr;
    /// The operand index of the use.
    unsigned EdgeIdx = UINT_MAX;
    // ...
    /// Debug print.
    void dump(raw_ostream &OS) const {
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
    }
  };
  /// A helper class used for scoring candidates for two consecutive lanes.
  class LookAheadHeuristics {
    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    int NumLanes; // Total number of lanes (aka vectorization factor).
    int MaxLevel; // The maximum recursion depth for accumulating score.

  public:
    LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
                        ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
                        int MaxLevel)
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}

    // The hard-coded scores listed here are not very important, though it
    // shall be higher for better matches to improve the resulting cost.
    // ... (score constants elided in this excerpt)

    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
    int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
                        ArrayRef<Value *> MainAltOps) const {
      if (!isValidElementType(V1->getType()) ||
          !isValidElementType(V2->getType()))
        return LookAheadHeuristics::ScoreFail;

      if (V1 == V2) {
        if (isa<LoadInst>(V1)) {
          // Returns true if the users of V1 and V2 won't need to be extracted.
          auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
            // Bail out if we have too many uses to save compilation time.
            if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
              return false;

            auto AllUsersVectorized = [U1, U2, this](Value *V) {
              return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
                return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
              });
            };
            return AllUsersVectorized(V1) && AllUsersVectorized(V2);
          };
          // A broadcast of a load can be cheaper on some targets.
          if (R.TTI->isLegalBroadcastLoad(V1->getType(),
                                          ElementCount::getFixed(NumLanes)) &&
              ((int)V1->getNumUses() == NumLanes ||
               AllUsersAreInternal(V1, V2)))
            return LookAheadHeuristics::ScoreSplatLoads;
        }
        return LookAheadHeuristics::ScoreSplat;
      }

      auto CheckSameEntryOrFail = [&]() {
        if (const TreeEntry *TE1 = R.getTreeEntry(V1);
            TE1 && TE1 == R.getTreeEntry(V2))
          return LookAheadHeuristics::ScoreSplatLoads;
        return LookAheadHeuristics::ScoreFail;
      };

      auto *LI1 = dyn_cast<LoadInst>(V1);
      auto *LI2 = dyn_cast<LoadInst>(V2);
      if (LI1 && LI2) {
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
            !LI2->isSimple())
          return CheckSameEntryOrFail();

        std::optional<int> Dist = getPointersDiff(
            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
        if (!Dist || *Dist == 0) {
          if (getUnderlyingObject(LI1->getPointerOperand()) ==
                  getUnderlyingObject(LI2->getPointerOperand()) &&
              R.TTI->isLegalMaskedGather(
                  getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
            return LookAheadHeuristics::ScoreMaskedGatherCandidate;
          return CheckSameEntryOrFail();
        }
        // The distance is too large - still may be profitable to use masked
        // loads/gathers.
        if (std::abs(*Dist) > NumLanes / 2)
          return LookAheadHeuristics::ScoreMaskedGatherCandidate;
        return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
                           : LookAheadHeuristics::ScoreReversedLoads;
      }

      auto *C1 = dyn_cast<Constant>(V1);
      auto *C2 = dyn_cast<Constant>(V2);
      if (C1 && C2)
        return LookAheadHeuristics::ScoreConstants;

      // Extracts from consecutive indexes of the same vector score better, as
      // the extracts could be vectorized later.
      Value *EV1;
      ConstantInt *Ex1Idx;
      if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
        // Undefs are always profitable for extractelements.
        if (isa<UndefValue>(V2))
          return LookAheadHeuristics::ScoreConsecutiveExtracts;
        Value *EV2 = nullptr;
        ConstantInt *Ex2Idx = nullptr;
        if (match(V2, m_ExtractElt(m_Value(EV2), m_ConstantInt(Ex2Idx))) &&
            EV2 == EV1) {
          int Idx1 = Ex1Idx->getZExtValue();
          int Idx2 = Ex2Idx->getZExtValue();
          int Dist = Idx2 - Idx1;
          // The distance is too large - still may be profitable to use
          // shuffles.
          if (std::abs(Dist) == 0)
            return LookAheadHeuristics::ScoreSplat;
          if (std::abs(Dist) > NumLanes / 2)
            return LookAheadHeuristics::ScoreSameOpcode;
          return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
                            : LookAheadHeuristics::ScoreReversedExtracts;
        }
        return LookAheadHeuristics::ScoreAltOpcodes;
      }

      auto *I1 = dyn_cast<Instruction>(V1);
      auto *I2 = dyn_cast<Instruction>(V2);
      if (I1 && I2) {
        if (I1->getParent() != I2->getParent())
          return CheckSameEntryOrFail();
        SmallVector<Value *, 4> Ops(MainAltOps);
        Ops.push_back(I1);
        Ops.push_back(I2);
        InstructionsState S = getSameOpcode(Ops, TLI);
        // Note: Only consider instructions with <= 2 operands to avoid
        // complexity explosion.
        if (S &&
            (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
            all_of(Ops, [&S](Value *V) {
              return isa<PoisonValue>(V) ||
                     cast<Instruction>(V)->getNumOperands() ==
                         S.getMainOp()->getNumOperands();
            }))
          return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
                                  : LookAheadHeuristics::ScoreSameOpcode;
      }

      if (I1 && isa<PoisonValue>(V2))
        return LookAheadHeuristics::ScoreSameOpcode;

      if (isa<UndefValue>(V2))
        return LookAheadHeuristics::ScoreUndef;

      return CheckSameEntryOrFail();
    }
    /// Go through the operands of \p LHS and \p RHS recursively until
    /// MaxLevel, and return the cummulative score.
    int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
                           Instruction *U2, int CurrLevel,
                           ArrayRef<Value *> MainAltOps) const {
      // Get the shallow score of LHS and RHS.
      int ShallowScoreAtThisLevel =
          getShallowScore(LHS, RHS, U1, U2, MainAltOps);

      // If we reached MaxLevel, or if V1 and V2 are not instructions, or if
      // they are identical, or if it is already profitable to vectorize loads
      // or extractelements, early return the current cost.
      auto *I1 = dyn_cast<Instruction>(LHS);
      auto *I2 = dyn_cast<Instruction>(RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      // Contains the I2 operand indexes that got matched with I1 operands.
      SmallSet<unsigned, 4> Op2Used;

      // Recursion towards the operands of I1 and I2. We are trying all
      // possible operand pairs, and keeping track of the best score.
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        // Try to pair op1I with the best operand of I2.
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
        // If I2 is commutative try all combinations.
        unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
        unsigned ToIdx = isCommutative(I2)
                             ? I2->getNumOperands()
                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          // Skip operands already paired with OpIdx1.
          if (Op2Used.count(OpIdx2))
            continue;
          // Recursively calculate the cost at each level.
          int TmpScore =
              getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
                                 I1, I2, CurrLevel + 1, {});
          // Look for the best score.
          if (TmpScore > LookAheadHeuristics::ScoreFail &&
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
            MaxOpIdx2 = OpIdx2;
            FoundBest = true;
          }
        }
        if (FoundBest) {
          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
        }
      }
      return ShallowScoreAtThisLevel;
    }
  };
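  // Illustrative sketch (not part of the original source): when deciding
  // whether to pair %a with %b or with %c in the next lane, the look-ahead
  // recursion scores operand sub-trees. Two adds whose operands are
  // consecutive loads (a[0], a[1]) recursively accumulate
  // ScoreConsecutiveLoads and win over a pair whose operands are unrelated
  // values, even though both pairs look identical at depth 0.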
  /// A helper data structure to hold the operands of a vector of instructions.
  /// This supports a fixed vector length for all operand vectors.
  class VLOperands {
    /// For each operand we need (i) the value, and (ii) the opcode that it
    /// would be attached to if the expression were in its left-linearized
    /// form.
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      /// The operand value.
      Value *V = nullptr;
      /// TreeEntries only allow a single opcode, or an alternate sequence of
      /// them (e.g. +, -), so a boolean suffices for the APO ("alternate
      /// predicate opcode"). It is 'true' if 'V' should be attached to an
      /// inverse operation.
      bool APO = false;
      /// Helper data for the reordering function.
      bool IsUsed = false;
    };

    /// During operand reordering we try to select the operand in each lane
    /// that matches best with the operand at the neighboring lane.
    enum class ReorderingMode {
      Load,     ///< Matching loads to consecutive memory addresses
      Opcode,   ///< Matching instructions based on opcode (same or alternate)
      Constant, ///< Matching constants
      Splat,    ///< Matching the same instruction multiple times (broadcast)
      Failed,   ///< We failed to create a vectorizable group
    };

    using OperandDataVec = SmallVector<OperandData, 2>;
    /// A vector of operand vectors.
    SmallVector<OperandDataVec, 4> OpsVec;
    /// When VL[0] is an IntrinsicInst, ArgSize is CallBase::arg_size;
    /// otherwise it is User::getNumOperands.
    unsigned ArgSize = 0;

    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    const Loop *L = nullptr;

    /// \returns the operand data at \p OpIdx and \p Lane.
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }

    /// Clears the used flag for all entries.
    void clearUsed() {
      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
           OpIdx != NumOperands; ++OpIdx)
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
             ++Lane)
          OpsVec[OpIdx][Lane].IsUsed = false;
    }

    /// Swap the operand at \p OpIdx1 with the one at \p OpIdx2 in \p Lane.
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    }
    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in \p Lane we are looking the best
    /// candidate for.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score due to possible broadcasting of the
    /// elements in the lane.
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
                      const SmallBitVector &UsedLanes) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
          isa<ExtractElementInst>(IdxLaneV))
        return 0;
      SmallDenseMap<Value *, unsigned, 4> Uniques;
      for (unsigned Ln : seq<unsigned>(getNumLanes())) {
        if (Ln == Lane)
          continue;
        Value *OpIdxLnV = getData(OpIdx, Ln).V;
        if (!isa<Instruction>(OpIdxLnV))
          return 0;
        Uniques.try_emplace(OpIdxLnV, Ln);
      }
      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      auto OpIdxIt = Uniques.find(OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        return 0;
      return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                          UniquesCntWithOpIdxLaneV,
                      UniquesCntWithOpIdxLaneV -
                          bit_floor(UniquesCntWithOpIdxLaneV)) -
             ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                  ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                  : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    }
    /// \returns The additional score for the scalar whose users are all
    /// vectorized.
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      // Do not care about the number of uses for vector-like instructions
      // (extractelement/extractvalue with constant indices): they are extracts
      // themselves and already externally used.
      if (isVectorLikeInstWithConstOps(IdxLaneV) &&
          isVectorLikeInstWithConstOps(OpIdxLaneV))
        return LookAheadHeuristics::ScoreAllUserVectorized;
      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
        return 0;
      return R.areAllUsersVectorized(IdxLaneI)
                 ? LookAheadHeuristics::ScoreAllUserVectorized
                 : 0;
    }

    /// Score scaling factor for fully compatible instructions but with
    /// different number of external uses. Scores a greater number of uses
    /// lower.
    static const int ScoreScaleFactor = 10;

    /// \returns the look-ahead score, which tells us how much the sub-trees
    /// rooted at \p LHS and \p RHS match: the more they match, the higher the
    /// score.
    int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
                          int Lane, unsigned OpIdx, unsigned Idx, bool &IsUsed,
                          const SmallBitVector &UsedLanes) {
      LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
                                    LookAheadMaxDepth);
      int Score =
          LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr,
                                       /*U2=*/nullptr, /*CurrLevel=*/1,
                                       MainAltOps);
      if (Score) {
        int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
        if (Score <= -SplatScore) {
          // Failed score.
          Score = 0;
        } else {
          Score += SplatScore;
          // Scale score to distinguish between operands with all vectorized
          // uses and similar operands without.
          Score *= ScoreScaleFactor;
          Score += getExternalUseScore(Lane, OpIdx, Idx);
          IsUsed = true;
        }
      }
      return Score;
    }
    // Search all operands in Ops[*][Lane] for the one that matches best
    // Ops[OpIdx][LastLane] and return its operand index.
    // If no good match can be found, return std::nullopt.
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                   ArrayRef<ReorderingMode> ReorderingModes,
                   ArrayRef<Value *> MainAltOps,
                   const SmallBitVector &UsedLanes) {
      unsigned NumOperands = getNumOperands();

      // The operand of the previous lane at OpIdx.
      Value *OpLastLane = getData(OpIdx, LastLane).V;

      // Our strategy mode for OpIdx.
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;

      // The linearized opcode of the operand at OpIdx, Lane.
      bool OpIdxAPO = getData(OpIdx, Lane).APO;

      // The best operand index and its score. Sometimes we have more than one
      // option (e.g. Opcode and Undefs), so the score differentiates them.
      struct BestOpData {
        std::optional<unsigned> Idx;
        unsigned Score = 0;
      } BestOp;
      BestOp.Score =
          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
              .first->second;

      // Track if the operand must be marked as used. Splat/Constant/Load
      // operands stay flexible and are not marked.
      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;
      // Iterate through all unused operands and look for the best.
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        // Get the operand at Idx and Lane.
        OperandData &OpData = getData(Idx, Lane);
        Value *Op = OpData.V;
        bool OpAPO = OpData.APO;

        // Skip already selected operands.
        if (OpData.IsUsed)
          continue;

        // Skip if we are trying to move the operand to a position with a
        // different opcode in the linearized tree form. This would break the
        // semantics.
        if (OpAPO != OpIdxAPO)
          continue;

        // Look for an operand that matches the current mode.
        switch (RMode) {
        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed, UsedLanes);
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
               Idx == OpIdx)) {
            BestOp.Idx = Idx;
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
          }
          break;
        }
        case ReorderingMode::Constant:
          if (isa<Constant>(Op) ||
              (!BestOp.Score && L && L->isLoopInvariant(Op))) {
            BestOp.Idx = Idx;
            if (isa<Constant>(Op)) {
              BestOp.Score = LookAheadHeuristics::ScoreConstants;
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                  LookAheadHeuristics::ScoreConstants;
            }
            if (isa<UndefValue>(Op) || !isa<Constant>(Op))
              IsUsed = false;
          }
          break;
        case ReorderingMode::Splat:
          if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
            IsUsed = Op == OpLastLane;
            if (Op == OpLastLane) {
              BestOp.Score = LookAheadHeuristics::ScoreSplat;
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                  LookAheadHeuristics::ScoreSplat;
            }
            BestOp.Idx = Idx;
          }
          break;
        case ReorderingMode::Failed:
          llvm_unreachable("Not expected Failed reordering mode.");
        }
      }

      if (BestOp.Idx) {
        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
        return BestOp.Idx;
      }
      // If we could not find a good match return std::nullopt.
      return std::nullopt;
    }
    /// Helper for reorder(). \returns the lane in the operand list that is
    /// most likely to succeed in reordering.
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      // std::pair<unsigned, unsigned> implements a simple voting scheme: we
      // pick the lane with the least number of operands that can freely move
      // about (or the one whose operand set is already the most constrained).
      SmallDenseMap<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
      // Try to stay close to the original results if multiple lanes have the
      // same cost; if two lanes tie, prefer the one with the highest index.
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        // Compare the number of operands that can move and choose the one with
        // the least number.
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap.clear();
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          // Select the most optimal lane in terms of the number of operands
          // that should be moved around.
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto [It, Inserted] =
              HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
          if (!Inserted)
            ++It->second.first;
        }
      }
      // Select the lane with the minimum counter.
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
      for (const auto &Data : reverse(HashMap)) {
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
      }
      return BestLane;
    }
    /// Data structure that helps to reorder operands.
    struct OperandsOrderData {
      /// The best number of operands with the same APOs, which can be
      /// reordered.
      unsigned NumOfAPOs = UINT_MAX;
      /// Number of operands with the same/alternate instruction opcode and
      /// parent.
      unsigned NumOpsWithSameOpcodeParent = 0;
      /// Hash for the actual operands ordering.
      unsigned Hash = 0;
    };

    /// \returns the maximum number of operands that are allowed to be
    /// reordered for \p Lane and the number of compatible instructions (with
    /// the same parent/opcode). Used as a heuristic for selecting the first
    /// lane to start operand reordering from.
    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      // Operands with the same APO can be reordered. Since there are only two
      // APOs (true and false), count the 'true' ones and derive the other by
      // subtraction. Operands with the same instruction opcode and parent are
      // more profitable since they often do not need to be moved.
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
      Instruction *OpcodeI = nullptr;
      BasicBlock *Parent = nullptr;
      unsigned Hash = 0;
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        const OperandData &OpData = getData(OpIdx, Lane);
        if (OpData.APO)
          ++CntTrue;
        // Use Boyer-Moore majority voting to find the majority opcode and the
        // number of times it occurs.
        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
          if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
              I->getParent() != Parent) {
            if (NumOpsWithSameOpcodeParent == 0) {
              NumOpsWithSameOpcodeParent = 1;
              OpcodeI = I;
              Parent = I->getParent();
            } else {
              --NumOpsWithSameOpcodeParent;
            }
          } else {
            ++NumOpsWithSameOpcodeParent;
          }
        }
        Hash = hash_combine(
            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
      }
      if (AllUndefs)
        return {};
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      Data.Hash = Hash;
      return Data;
    }
    /// Go through the instructions in VL and append their operands.
    void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
      assert(!VL.empty() && "Bad VL");
      assert((empty() || VL.size() == getNumLanes()) &&
             "Expected same number of lanes");
      // IntrinsicInst::isCommutative returns true if swapping the first "two"
      // arguments to the intrinsic produces the same result.
      constexpr unsigned IntrinsicNumOperands = 2;
      unsigned NumOperands = VL0->getNumOperands();
      ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
      OpsVec.resize(NumOperands);
      unsigned NumLanes = VL.size();
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        OpsVec[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
                 "Expected instruction or poison value");
          // Our tree has just 3 nodes: the root and two operands. It is
          // therefore trivial to get the APO: we only need to check the opcode
          // of VL[Lane] and whether the operand at OpIdx is the LHS or RHS.
          if (isa<PoisonValue>(VL[Lane])) {
            OpsVec[OpIdx][Lane] = {
                PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
                false};
            continue;
          }
          // Since operand reordering is performed on groups of commutative
          // operations or alternating sequences (e.g. +, -), we can safely
          // tell the inverse operations by checking commutativity.
          bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
                                 APO, false};
        }
      }
    }
    /// \returns the number of operands.
    unsigned getNumOperands() const { return ArgSize; }

    /// \returns the number of lanes.
    unsigned getNumLanes() const { return OpsVec[0].size(); }

    /// \returns the operand value at \p OpIdx and \p Lane.
    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    }

    /// \returns true if the data structure is empty.
    bool empty() const { return OpsVec.empty(); }

    /// Clears the data.
    void clear() { OpsVec.clear(); }
    /// \returns true if there are enough operands identical to \p Op to fill
    /// the whole vector (it is mixed with constants or loop invariant values).
    /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      // Small number of loads - try load matching.
      if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
        return false;
      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(Op);
      unsigned Cnt = 0;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        // This is set to true if we found a candidate for broadcast at Lane.
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
            continue;
          Value *OpILane = getValue(OpI, Lane);
          bool IsConstantOp = isa<Constant>(OpILane);
          // Consider the broadcast candidate if:
          // 1. Same value is found in one of the operands.
          if (Data.V == Op ||
              // 2. The operand in the given lane is not constant but there is
              // a constant operand in another lane (which can be moved to the
              // given lane): a simple permutation of constant and broadcast.
              (!IsConstantOp &&
               ((Lns > 2 && isa<Constant>(Data.V)) ||
                // 2.1. With only 2 lanes, check that the value in the next
                // lane does not build the same-opcode sequence.
                (Lns == 2 &&
                 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
                 isa<Constant>(Data.V)))) ||
              // 3. The operand in the current lane is loop invariant (can be
              // hoisted out) and another operand is also loop invariant
              // (though not a constant): the whole vector can be hoisted out.
              (IsInvariant && !isa<Constant>(Data.V) &&
               !getSameOpcode({Op, Data.V}, TLI) &&
               L->isLoopInvariant(Data.V))) {
            FoundCandidate = true;
            Data.IsUsed = Data.V == Op;
            if (Data.V == Op)
              ++Cnt;
            break;
          }
        }
        if (!FoundCandidate)
          return false;
      }
      return getNumLanes() == 2 || Cnt > 1;
    }
    /// Checks if there is at least a single compatible operand in lanes other
    /// than \p Lane, compatible with the operand \p Op.
    bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      bool OpAPO = getData(OpIdx, Lane).APO;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
              const OperandData &Data = getData(OpI, Ln);
              if (Data.APO != OpAPO || Data.IsUsed)
                return false;
              Value *OpILn = getValue(OpI, Ln);
              return (L && L->isLoopInvariant(OpILn)) ||
                     (getSameOpcode({Op, OpILn}, TLI) &&
                      allSameBlock({Op, OpILn}));
            }))
          return true;
      }
      return false;
    }
  public:
    /// Initialize with all the operands of the instruction vector \p RootVL.
    VLOperands(ArrayRef<Value *> RootVL, Instruction *VL0, const BoUpSLP &R)
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
          L(R.LI->getLoopFor((VL0->getParent()))) {
      // Append all the operands of RootVL.
      appendOperandsOfVL(RootVL, VL0);
    }

    /// \returns a value vector with the operands across all lanes for the
    /// operand at \p OpIdx.
    ValueList getVL(unsigned OpIdx) const {
      ValueList OpVL(OpsVec[OpIdx].size());
      assert(OpsVec[OpIdx].size() == getNumLanes() &&
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;
      return OpVL;
    }
    // Performs operand reordering for 2 or more operands.
    // The original operands are in OrigOps[OpIdx][Lane].
    // The reordered operands are returned in Ops[OpIdx][Lane].
    void reorder() {
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      // Each operand has its own mode, which helps select the candidate in
      // each lane that matches best with the ones in the other lanes.
      SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

      // Find the first lane that we will start our search from.
      unsigned FirstLane = getBestLaneToStartReordering();

      // Initialize the modes.
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);
        // Keep track of our reordering state.
        if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
          // Check if the operand should be broadcast, or can be vectorized.
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
              !canBeVectorized(OpILane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else if (isa<LoadInst>(OpILane0))
            ReorderingModes[OpIdx] = ReorderingMode::Load;
          else
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(OpLane0)) {
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        } else if (isa<Argument>(OpLane0)) {
          // Our best hope is a Splat. It may save some cost in some cases.
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        } else {
          llvm_unreachable("Unexpected value kind.");
        }
      }

      // Check that we don't have the same operands everywhere. No need to
      // reorder if the operands already form a perfect diamond match.
      auto &&SkipReordering = [this]() {
        SmallPtrSet<Value *, 4> UniqueValues;
        ArrayRef<OperandData> Op0 = OpsVec.front();
        for (const OperandData &Data : Op0)
          UniqueValues.insert(Data.V);
        for (ArrayRef<OperandData> Op :
             ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
                return !UniqueValues.contains(Data.V);
              }))
            return false;
        }
        return true;
      };

      // If the initial strategy fails for any of the operand indexes, then we
      // perform reordering again in a second pass.
      if (SkipReordering())
        return;

      bool StrategyFailed = false;
      // Mark all operand data as free to use.
      clearUsed();
      // We keep the original operand order for the FirstLane, so reorder the
      // rest of the lanes. We visit the lanes in a circular fashion, using
      // FirstLane as the center point and increasing the radius distance.
      SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
      for (unsigned I = 0; I < NumOperands; ++I)
        MainAltOps[I].push_back(getData(I, FirstLane).V);

      SmallBitVector UsedLanes(NumLanes);
      UsedLanes.set(FirstLane);
      for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
        // Visit the lane on the right and then the lane on the left.
        for (int Direction : {+1, -1}) {
          int Lane = FirstLane + Direction * Distance;
          if (Lane < 0 || Lane >= (int)NumLanes)
            continue;
          UsedLanes.set(Lane);
          int LastLane = Lane - Direction;
          assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                 "Out of bounds");
          // Look for a good match for each operand.
          for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
            // Search for the operand that matches SortedOps[OpIdx][Lane-1].
            std::optional<unsigned> BestIdx =
                getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                               MainAltOps[OpIdx], UsedLanes);
            // By not selecting a value, we allow the operands that follow to
            // select a better matching value. We will get a non-null value in
            // the next run of getBestOperand().
            if (BestIdx) {
              // Swap the current operand with the one returned by
              // getBestOperand().
              swap(OpIdx, *BestIdx, Lane);
            } else {
              // Enable the second pass.
              StrategyFailed = true;
            }
            // Try to get the alternate opcode and follow it during analysis.
            if (MainAltOps[OpIdx].size() != 2) {
              OperandData &AltOp = getData(OpIdx, Lane);
              InstructionsState OpS =
                  getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
              if (OpS && OpS.isAltShuffle())
                MainAltOps[OpIdx].push_back(AltOp.V);
            }
          }
        }
      }
      // Skip the second pass if the strategy did not fail.
      if (!StrategyFailed)
        return;
      // ...
    }
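    // Illustrative sketch (not part of the original source): given two lanes
    //   lane0: a0 + b0      lane1: b1 + a1
    // reorder() starts from the best lane and, for the second lane, swaps the
    // operands so that the "a" values line up in operand 0 and the "b" values
    // in operand 1, yielding two shuffle-free operand vectors instead of two
    // permuted ones.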
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
      switch (RMode) {
      case ReorderingMode::Load:
        return "Load";
      case ReorderingMode::Opcode:
        return "Opcode";
      case ReorderingMode::Constant:
        return "Constant";
      case ReorderingMode::Splat:
        return "Splat";
      case ReorderingMode::Failed:
        return "Failed";
      }
      llvm_unreachable("Unimplemented Reordering Type");
    }

    /// Debug print.
    LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
      const unsigned Indent = 2;
      unsigned Cnt = 0;
      for (const OperandDataVec &OpDataVec : OpsVec) {
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          OS.indent(Indent) << "{";
          if (Value *V = OpData.V)
            OS << *V;
          else
            OS << "null";
          OS << ", APO:" << OpData.APO << "}\n";
        }
        OS << "\n";
      }
      return OS;
    }
#endif
  };
  /// Evaluate each pair in \p Candidates and return the index of the pair
  /// with the highest score, i.e. the best chance of forming the root of a
  /// profitable tree. Return std::nullopt if no candidate scored above
  /// \p Limit.
  std::optional<int>
  findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
                   int Limit = LookAheadHeuristics::ScoreFail) const {
    LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
                                  RootLookAheadMaxDepth);
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
      int Score = LookAhead.getScoreAtLevelRec(
          Candidates[I].first, Candidates[I].second,
          /*U1=*/nullptr, /*U2=*/nullptr, /*CurrLevel=*/1, {});
      if (Score > BestScore) {
        BestScore = Score;
        Index = I;
      }
    }
    return Index;
  }
  /// Removes an instruction from its block and eventually deletes it. The
  /// actual deletion is delayed until BoUpSLP is destructed.
  void eraseInstruction(Instruction *I) {
    DeletedInstructions.insert(I);
  }

  /// Remove instructions from the parent function, clear the operands of \p
  /// DeadVals instructions, and mark trivially dead operands for deletion.
  template <typename T>
  void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
    SmallVector<WeakTrackingVH> DeadInsts;
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      DeletedInstructions.insert(I);
    }
    DenseSet<Value *> Processed;
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
        continue;
      auto *I = cast<Instruction>(V);
      salvageDebugInfo(*I);
      SmallVector<const TreeEntry *> Entries;
      if (const TreeEntry *Entry = getTreeEntry(I)) {
        Entries.push_back(Entry);
        auto It = MultiNodeScalars.find(I);
        if (It != MultiNodeScalars.end())
          Entries.append(It->second.begin(), It->second.end());
      }
      for (Use &U : I->operands()) {
        if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            wouldInstructionBeTriviallyDead(OpI, TLI) &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
             })))
          DeadInsts.push_back(OpI);
      }
      I->dropAllReferences();
    }
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      if (!I->getParent())
        continue;
      assert((I->use_empty() || all_of(I->uses(),
                                       [&](Use &U) {
                                         return isDeleted(
                                             cast<Instruction>(U.getUser()));
                                       })) &&
             "trying to erase instruction with users.");
      I->removeFromParent();
      SE->forgetValue(I);
    }
    // Process the dead instruction list until empty.
    while (!DeadInsts.empty()) {
      Value *V = DeadInsts.pop_back_val();
      Instruction *VI = cast_or_null<Instruction>(V);
      if (!VI || !VI->getParent())
        continue;
      assert(isInstructionTriviallyDead(VI, TLI) &&
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");

      // Don't lose the debug info while deleting the instructions.
      salvageDebugInfo(*VI);

      // Null out all of the instruction's operands to see if any operand
      // becomes dead as we go.
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (!OpV)
          continue;
        OpU.set(nullptr);

        if (!OpV->use_empty())
          continue;

        // If the operand is an instruction that became dead as we nulled out
        // the operand, and if it is 'trivially' dead, delete it in a future
        // loop iteration.
        if (auto *OpI = dyn_cast<Instruction>(OpV))
          if (!DeletedInstructions.contains(OpI) &&
              isInstructionTriviallyDead(OpI, TLI))
            DeadInsts.push_back(OpI);
      }

      VI->removeFromParent();
      DeletedInstructions.insert(VI);
      SE->forgetValue(VI);
    }
  }
  /// Checks if the instruction was already analyzed for being a possible
  /// reduction root.
  bool isAnalyzedReductionRoot(Instruction *I) const {
    return AnalyzedReductionsRoots.count(I);
  }
  /// Register the given instruction as already analyzed for being a possible
  /// reduction root.
  void analyzedReductionRoot(Instruction *I) {
    AnalyzedReductionsRoots.insert(I);
  }
  // ...
  /// Clear the list of the analyzed reduction root instructions.
  void clearReductionData() {
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
  }
  // ...
  /// Checks if the specified value was not scheduled.
  bool isNotScheduled(const Value *V) const {
    return NonScheduledFirst.contains(V);
  }
  /// Collects values that can be demoted to a smaller bit width in the
  /// vectorized tree rooted at \p E.
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
      unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
      bool IsTruncRoot) const;

  /// Check if the operands on the edges \p Edges of the \p UserTE allow
  /// reordering (i.e. the operands can be reordered because they have only one
  /// user and are reorderable).
  bool canReorderOperands(
      TreeEntry *UserTE,
      SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
      ArrayRef<TreeEntry *> ReorderableGathers,
      SmallVectorImpl<TreeEntry *> &GatherOps);

  /// Checks if the given \p TE is a gather node with clustered reused scalars
  /// and reorders it per the given \p Mask.
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
  /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
  /// if any. If it is not vectorized (gather node), returns nullptr.
  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
    TreeEntry *TE = nullptr;
    const auto *It = find_if(VL, [&](Value *V) {
      TE = getTreeEntry(V);
      if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
        return true;
      auto It = MultiNodeScalars.find(V);
      if (It != MultiNodeScalars.end()) {
        for (TreeEntry *E : It->second) {
          if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
            TE = E;
            return true;
          }
        }
      }
      return false;
    });
    if (It != VL.end()) {
      assert(TE->isSame(VL) && "Expected same scalars.");
      return TE;
    }
    return nullptr;
  }

  /// Const version of the above.
  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);
  }
  /// Checks if all users of \p I are part of the vectorization tree.
  bool areAllUsersVectorized(
      Instruction *I,
      const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;

  /// \returns the graph entry for the \p Idx operand of the \p E entry.
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;

  /// Gets the root instruction for the given node. If the node is a strided
  /// load/store node with reversed order, the root instruction is the last
  /// one.
  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;

  /// \returns Cast context for the given graph node.
  TargetTransformInfo::CastContextHint
  getCastContextHint(const TreeEntry &TE) const;

  /// This is the recursive part of buildTree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
                     const EdgeInfo &EI, unsigned InterleaveFactor = 0);

  /// \returns true if the ExtractElement/ExtractValue instructions in \p VL
  /// can be vectorized to use the original vector; sets \p CurrentOrder
  /// accordingly.
  bool canReuseExtract(ArrayRef<Value *> VL,
                       SmallVectorImpl<unsigned> &CurrentOrder,
                       bool ResizeAllowed = false) const;

  /// Returns the vectorized operand node that matches the order of the
  /// scalars in the node.
  TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
  const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
                                               unsigned NodeIdx) const {
    return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
  }

  /// Vectorize a single entry in the tree, the \p NodeIdx-th operand of the
  /// entry \p E.
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);

  /// Create a new vector from a list of scalar values.
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);

  /// Create a new vector from a list of scalar values.
  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy,
                           bool PostponedPHIs);

  /// Returns the instruction in the bundle that can be used as a base point
  /// for scheduling.
  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  /// Tries to find extractelement instructions with constant indices from a
  /// fixed vector type and gather them into a bunch, which is likely to be
  /// detected as a shuffle of one or two input vectors.
  std::optional<TargetTransformInfo::ShuffleKind>
  tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
                                           SmallVectorImpl<int> &Mask) const;

  /// Same as above, but per register: processes \p NumParts registers.
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
  tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                             SmallVectorImpl<int> &Mask,
                             unsigned NumParts) const;

  /// Checks if the gathered \p VL can be represented as a single-register
  /// shuffle of previous tree entries.
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
      const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
      SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
      bool ForOrder);

  /// Checks if the gathered \p VL can be represented as multi-register
  /// shuffle(s) of previous tree entries.
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
  isGatherShuffledEntry(
      const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
      SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
      unsigned NumParts, bool ForOrder = false);

  /// \returns the cost of gathering (inserting) the values in \p VL into a
  /// vector.
  InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                Type *ScalarTy) const;

  /// Set the Builder insert point to one after the last instruction in the
  /// bundle.
  void setInsertPointAfterBundle(const TreeEntry *E);

  /// \returns whether the VectorizableTree is fully vectorizable and will be
  /// beneficial even if the tree height is tiny.
  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  /// Run through the list of all gathered loads in the graph and try to find
  /// vector loads/masked gathers instead of regular loads, and emit these
  /// instructions.
  void tryToVectorizeGatheredLoads(
      const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                           SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
                           8> &GatheredLoads);

  /// Collects the stores that are used in the tree entry \p TE as scalar
  /// users.
  SmallVector<SmallVector<StoreInst *>>
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  /// Finds the reorder indices implied by external store users of \p TE.
  SmallVector<OrdersType, 1>
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;

  /// Tries to reorder the gathering node for better vectorization
  /// opportunities.
  void reorderGatherNode(TreeEntry &TE);
  struct TreeEntry {
    using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
        if (Mask.size() != VL.size() && VL.size() == Scalars.size())
          return std::equal(VL.begin(), VL.end(), Scalars.begin());
        return VL.size() == Mask.size() &&
               std::equal(VL.begin(), VL.end(), Mask.begin(),
                          [Scalars](Value *V, int Idx) {
                            return (isa<UndefValue>(V) &&
                                    Idx == PoisonMaskElem) ||
                                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
                          });
      };
      if (!ReorderIndices.empty()) {
        // TODO: implement matching with the 1st lane reordering.
        SmallVector<int> Mask;
        inversePermutation(ReorderIndices, Mask);
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          ::addMask(Mask, ReuseShuffleIndices);
          return IsSame(Scalars, Mask);
        }
        return false;
      }
      return IsSame(Scalars, ReuseShuffleIndices);
    }

    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return isGather() && !UserTreeIndices.empty() &&
             UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;
    }

    /// \returns true if the current entry has the same operands as \p TE.
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        return false;
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (Used.test(K))
            continue;
          if (getOperand(K) == TE.getOperand(I)) {
            Used.set(K);
            break;
          }
        }
        // Check if we actually found the matching operand.
        if (PrevCount == Used.count())
          return false;
      }
      return true;
    }

    /// \returns the vectorization factor of the node: the number of reused
    /// shuffle indices if present, otherwise the number of scalars.
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    }

    /// Checks if the current node is a gather node.
    bool isGather() const { return State == NeedToGather; }
    /// List of combined opcodes supported by the vectorizer.
    enum CombinedOpcode {
      NotCombinedOp = -1,
      MinMax = Instruction::OtherOpsEnd + 1,
    };
    CombinedOpcode CombinedOp = NotCombinedOp;

    // ...

    /// Points back to the VectorizableTree.
    VecTreeTy &Container;

    // ...

    /// Interleaving factor for interleaved loads Vectorize nodes.
    unsigned InterleaveFactor = 0;

  public:
    /// Returns the interleave factor for interleave nodes.
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    /// Sets the interleaving factor for interleave nodes.
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
    /// Set this bundle's \p OpIdx'th operand to \p OpVL.
    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
      assert(Operands[OpIdx].empty() && "Already resized?");
      assert(OpVL.size() <= Scalars.size() &&
             "Number of operands is greater than the number of scalars.");
      Operands[OpIdx].resize(OpVL.size());
      copy(OpVL, Operands[OpIdx].begin());
    }

    /// Set this bundle's operands from the scalars of the node.
    void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
      VLOperands Ops(Scalars, MainOp, R);
      if (RequireReorder)
        Ops.reorder();
      for (unsigned I : seq<unsigned>(MainOp->getNumOperands()))
        setOperand(I, Ops.getVL(I));
    }

    // ...

    /// \returns the number of operands.
    unsigned getNumOperands() const { return Operands.size(); }

    /// \returns the first lane value of the \p OpIdx operand.
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      assert(!Operands[OpIdx].empty() && "No operand available");
      return Operands[OpIdx][0];
    }
    /// Some of the instructions in the list have alternate opcodes.
    bool isAltShuffle() const { return MainOp != AltOp; }

    bool isOpcodeOrAlt(Instruction *I) const {
      unsigned CheckedOpcode = I->getOpcode();
      return (getOpcode() == CheckedOpcode ||
              getAltOpcode() == CheckedOpcode);
    }

    /// Chooses the correct key for scheduling data. If \p Op has the same (or
    /// alternate) opcode as the main opcode, the key is \p Op; otherwise the
    /// key is the main operation.
    Value *isOneOf(Value *Op) const {
      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))
        return Op;
      return MainOp;
    }

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
      MainOp = S.getMainOp();
      AltOp = S.getAltOp();
    }

    /// The main/alternate opcodes for the list of instructions.
    unsigned getOpcode() const {
      return MainOp ? MainOp->getOpcode() : 0;
    }

    unsigned getAltOpcode() const {
      return AltOp ? AltOp->getOpcode() : 0;
    }
    /// When ReuseShuffleIndices is empty this just returns the position of \p
    /// V within the vector of Scalars; otherwise it remaps through the reuse
    /// index.
    int findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        if (*It != V)
          continue;
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
          break;
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
          break;
        }
      }
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
      return FoundLane;
    }
    /// Returns true if the current node has a non-power-of-2 number of
    /// scalars.
    bool isNonPowOf2Vec() const {
      bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
      return IsNonPowerOf2;
    }

    /// Returns true if the current node has a padded (non-whole-register,
    /// non-power-of-2) number of scalars.
    bool hasNonWholeRegisterOrNonPowerOf2Vec(
        const TargetTransformInfo &TTI) const {
      bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
          TTI, getValueType(Scalars.front()), Scalars.size());
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    }

    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      inversePermutation(ReorderIndices, Mask);
      return Scalars[Mask[Idx]];
    }
#ifndef NDEBUG
    /// Debug printer.
    LLVM_DUMP_METHOD void dump() const {
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          dbgs().indent(2) << *V << "\n";
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        dbgs().indent(2) << *V << "\n";
      dbgs() << "State: ";
      switch (State) {
      case Vectorize:
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
                 << "\n";
        } else {
          dbgs() << "Vectorize\n";
        }
        break;
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        break;
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        break;
      case NeedToGather:
        dbgs() << "NeedToGather\n";
        break;
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
        break;
      }
      dbgs() << "MainOp: ";
      if (MainOp)
        dbgs() << *MainOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "AltOp: ";
      if (AltOp)
        dbgs() << *AltOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "\n";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";
      dbgs() << "\n";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
        interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
        });
        dbgs() << "\n";
      }
    }
#endif
  };
#ifndef NDEBUG
  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
                     InstructionCost VecCost, InstructionCost ScalarCost,
                     StringRef Banner) const {
    dbgs() << "SLP: " << Banner << ":\n";
    E->dump();
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
  }
#endif
  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {},
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);
    return E;
  }

  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          TreeEntry::EntryState EntryState,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
           "Need to vectorize gather entry?");
    // Gathered loads still gathered? Do not create an entry, use the original
    // one.
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
      return nullptr;
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    // FIXME: Remove once support for ReuseShuffleIndices has been implemented
    // for non-power-of-two vectors.
    assert(
        (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
         ReuseShuffleIndices.empty()) &&
        "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->Scalars.assign(VL.begin(), VL.end());
      if (S)
        Last->setOperations(S);
    } else {
      // Reorder scalars and build the final mask.
      Last->Scalars.assign(VL.size(), nullptr);
      transform(ReorderIndices, Last->Scalars.begin(), [VL](unsigned Idx) {
        if (Idx >= VL.size())
          return UndefValue::get(VL.front()->getType());
        return VL[Idx];
      });
      InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
      if (S)
        Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    }
    if (!Last->isGather()) {
      for (Value *V : VL) {
        if (isa<PoisonValue>(V))
          continue;
        const TreeEntry *TE = getTreeEntry(V);
        assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
               "Scalar already in tree!");
        if (TE) {
          if (TE != Last)
            MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
          continue;
        }
        ScalarToTreeEntry[V] = Last;
      }
      // Update the scheduler bundle to point to this TreeEntry.
      ScheduleData *BundleMember = *Bundle;
      assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
              isVectorLikeInstWithConstOps(S.getMainOp()) ||
              doesNotNeedToSchedule(VL)) &&
             "Bundle and VL out of sync");
      if (BundleMember) {
        for (Value *V : VL) {
          if (doesNotNeedToBeScheduled(V))
            continue;
          if (!BundleMember)
            continue;
          BundleMember->TE = Last;
          BundleMember = BundleMember->NextInBundle;
        }
      }
      assert(!BundleMember && "Bundle and VL out of sync");
    } else {
      // Build a map for gathered scalars to the nodes where they are used.
      bool AllConstsOrCasts = true;
      for (Value *V : VL)
        if (!isConstant(V)) {
          auto *I = dyn_cast<CastInst>(V);
          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
          if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
              !UserTreeIdx.UserTE->isGather())
            ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
        }
      if (AllConstsOrCasts)
        CastMaxMinBWSizes =
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert(VL.begin(), VL.end());
    }

    if (UserTreeIdx.UserTE)
      Last->UserTreeIndices.push_back(UserTreeIdx);
    return Last;
  }
  /// -- Vectorization State --
  /// Holds all of the tree entries.
  TreeEntry::VecTreeTy VectorizableTree;

#ifndef NDEBUG
  /// Debug printer.
  LLVM_DUMP_METHOD void dumpVectorizableTree() const {
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
      dbgs() << "\n";
    }
  }
#endif

  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }

  const TreeEntry *getTreeEntry(Value *V) const {
    return ScalarToTreeEntry.lookup(V);
  }
  /// Check that the operand node of an alternate node does not generate a
  /// buildvector sequence; if it does, building an alternate shuffle is likely
  /// not worth it.
  bool areAltOperandsProfitable(const InstructionsState &S,
                                ArrayRef<Value *> VL) const;

  /// Checks if the specified list of the instructions/values can be vectorized
  /// and fills required data before the actual scheduling of the instructions.
  TreeEntry::EntryState
  getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
                               bool IsScatterVectorizeUserTE,
                               OrdersType &CurrentOrder,
                               SmallVectorImpl<Value *> &PointerOps);

  // ...

  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;

  // ...

  /// True if graph-transforming mode is on.
  bool IsGraphTransformMode = false;

  /// The index of the first gathered load entry in the VectorizableTree.
  std::optional<unsigned> GatheredLoadsEntriesFirst;

  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    // ...
  };
  /// Checks if two instructions may access the same memory, caching the
  /// result.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    // First check if the result is already in the cache.
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto It = AliasCache.find(Key);
    if (It != AliasCache.end())
      return It->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    // Store the result in the cache.
    AliasCache.try_emplace(Key, Aliased);
    AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
    return Aliased;
  }

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  /// Cache for alias results.
  DenseMap<AliasCacheKey, bool> AliasCache;

  // ...

  /// A list of values that need to be extracted out of the tree. This list
  /// holds pairs of (Internal Scalar : External User). An External User of
  /// nullptr means the Internal Scalar will be used later, after
  /// vectorization.
  UserList ExternalUses;
  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  struct ScheduleData {
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

    void init(int BlockSchedulingRegionID, Instruction *I) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      Inst = I;
      TE = nullptr;
    }

    /// Verify basic self-consistency properties.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

      if (IsScheduled) {
        assert(isSchedulingEntity() &&
               "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");
        }
      }

      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");
    }

    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within a
    /// single bundle.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      UnscheduledDeps = Dependencies;
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      int Sum = 0;
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
          return InvalidDeps;
        Sum += BundleMember->UnscheduledDeps;
      }
      return Sum;
    }

    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst = nullptr;

    /// The TreeEntry that this instruction corresponds to.
    TreeEntry *TE = nullptr;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle = nullptr;

    /// Single linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle = nullptr;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore = nullptr;

    /// The dependent memory instructions. This list is derived on demand in
    /// calculateDependencies().
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// List of instructions which this instruction could be control dependent
    /// on.
    SmallVector<ScheduleData *, 4> ControlDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;

    /// The number of dependencies: the number of users of the instruction plus
    /// the number of dependent memory instructions (if any). Calculated on
    /// demand.
    int Dependencies = InvalidDeps;

    /// The number of dependencies minus the number of dependencies of
    /// scheduled instructions. As soon as this is zero, the
    /// instruction/bundle gets ready for scheduling.
    int UnscheduledDeps = InvalidDeps;

    /// True if this instruction is scheduled (or considered as scheduled in
    /// the dry-run).
    bool IsScheduled = false;
  };

#ifndef NDEBUG
  friend inline raw_ostream &operator<<(raw_ostream &os,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(os);
    return os;
  }
#endif
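  // Illustrative note (not part of the original source): a bundle such as
  // {store a[0], store a[1]} is represented as a linked list of ScheduleData
  // nodes via NextInBundle, with FirstInBundle as the single scheduling
  // entity. The bundle becomes ready only when unscheduledDepsInBundle() == 0,
  // i.e. every member has had all of its def-use, memory, and control
  // dependencies scheduled.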
  /// Contains all scheduling data for a basic block.
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      // Reduce the maximum schedule region size by the size of the previous
      // scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }
    ScheduleData *getScheduleData(Instruction *I) {
      if (BB != I->getParent())
        // Avoid lookup if it can't possibly be in the map.
        return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))
        return SD;
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }
    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;

      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {

        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert to ready list if ready.
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          ScheduleData *OpDef = getScheduleData(I);
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:    gets ready (def): " << *DepBundle << "\n");
          }
        };

        // If BundleMember is a vector bundle, its operands may have been
        // reordered during buildTree(), so get its operands through the
        // TreeEntry.
        if (TreeEntry *TE = BundleMember->TE) {
          // Need to search for the lane since the tree entry can be reordered.
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, BundleMember->Inst));
          assert(Lane >= 0 && "Lane not set");

          // Since the vectorization tree is built recursively, this assertion
          // ensures the tree entry has all operands set before we reach this
          // code. Known exceptions are extracts, whose second (immediate)
          // operand is not added; immediates do not affect scheduler behavior.
          auto *In = BundleMember->Inst;
          assert(
              In &&
              (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
               In->getNumOperands() == TE->getNumOperands()) &&
              "Missed TreeEntry operands?");

          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              DecrUnsched(I);
        } else {
          // If BundleMember is a stand-alone instruction, no operand
          // reordering has taken place, so we directly access its operands.
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
              DecrUnsched(I);
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:     gets ready (mem): " << *DepBundle << "\n");
          }
        }
        // Handle the control dependencies.
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP:     gets ready (ctl): " << *DepBundle << "\n");
          }
        }
      }
    }
    /// Verify basic self-consistency properties of the data structure.
    void verify() {
      if (!ScheduleStart)
        return;

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
        SD->verify();
      }

      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");
        (void)SD;
      }
    }
    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
            SD->isReady()) {
          ReadyList.insert(SD);
          LLVM_DEBUG(dbgs()
                     << "SLP:    initially in ready list: " << *SD << "\n");
        }
      }
    }
    /// Checks if a bundle of instructions can be scheduled, i.e. has no cyclic
    /// dependencies. This is only a dry-run; no instructions are actually
    /// moved at this stage.
    /// \returns the scheduling bundle. The returned Optional value is not
    /// std::nullopt if \p VL is allowed to be scheduled.
    std::optional<ScheduleData *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S);

    /// Allocates a schedule data chunk.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all
    /// instructions/bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();

    // ...

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region?
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    // ...

    /// The ID of the current scheduling region. For a new vectorization
    /// iteration this is incremented, which "removes" all ScheduleData from
    /// the region. The initial value must be greater than the initial
    /// SchedulingRegionID in ScheduleData (which is 0).
    int SchedulingRegionID = 1;
  };

  // ...

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);
  /// A DenseMapInfo implementation for ordering vectors.
  struct OrdersTypeDenseMapInfo {
    // ...
    static unsigned getHashValue(const OrdersType &V) {
      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
    }
    // ...
  };

  // ...
  unsigned MaxVecRegSize;
  unsigned MinVecRegSize;

  // ...

  /// Final size of the reduced vector, if the current graph represents the
  /// input for the reduction and it was possible to narrow the size of the
  /// reduction.
  unsigned ReductionBitWidth = 0;

  /// Canonical graph size before the transformations.
  unsigned BaseGraphSize = 1;

  /// If the tree contains any zext/sext/trunc nodes, contains the max-min
  /// pair of type sizes used in the tree.
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
template <> struct GraphTraits<BoUpSLP *> {
  using TreeEntry = BoUpSLP::TreeEntry;
  /// NodeRef has to be a pointer per the GraphWriter.
  using NodeRef = TreeEntry *;
  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;

  /// Add the VectorizableTree to the index iterator to be able to return
  /// TreeEntry pointers.
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    ContainerTy &VectorizableTree;
    // ...
  };

  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }

  static ChildIteratorType child_begin(NodeRef N) {
    return {N->UserTreeIndices.begin(), N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {N->UserTreeIndices.end(), N->Container};
  }

  /// For the node iterator we just need to turn the TreeEntry iterator into a
  /// TreeEntry* iterator so that it dereferences to NodeRef.
  class nodes_iterator {
    // ...
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
  };

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }

  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};
template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
  using TreeEntry = BoUpSLP::TreeEntry;

  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << Entry->Idx << ".\n";
    if (isSplat(Entry->Scalars))
      OS << "<splat> ";
    for (auto *V : Entry->Scalars) {
      OS << *V;
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
          }))
        OS << " <extract>";
      OS << "\n";
    }
    return Str;
  }

  static std::string getNodeAttributes(const TreeEntry *Entry,
                                       const BoUpSLP *) {
    if (Entry->isGather())
      return "color=red";
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize)
      return "color=blue";
    return "";
  }
};
BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      // Temporarily insert instructions back to erase them from parent and
      // memory later.
      if (isa<PHINode>(I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator());
      continue;
    }
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }

  // Cleanup any dead scalar code feeding the vectorized instructions.
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);

#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ||.
  assert(!verifyFunction(*F, &dbgs()));
#endif
}
4546 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
4547 "Expected non-empty mask.");
4550 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
4552 Reuses[Mask[
I]] = Prev[
I];
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  // ...
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
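// Worked example (illustrative): with Sz == 4 and MaskOrder == {2, 0, 3, 1}
// the final loop computes the inverse permutation Order[MaskOrder[I]] = I,
// i.e. Order == {1, 3, 0, 2}: source element 2 moves to position 0, so the
// order entry at index 2 records destination 0, and so on.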
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  // ...
  Type *ScalarTy = GatheredScalars.front()->getType();
  int NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  // ...
  if (NumParts == 0 || NumParts >= NumScalars ||
      VecTy->getNumElements() % NumParts != 0 ||
      !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
                                VecTy->getNumElements() / NumParts))
    NumParts = 1;
  // ...
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                            /*ForOrder=*/true);
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
      Entries.front().front()->isSame(TE.Scalars)) {
    // Perfect match in the graph, will reuse the previously vectorized node.
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    // ...
    return all_of(Mask, [&](int I) {
      // ...
    });
  };
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  // ...
  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                  ArrayRef<int> Mask, int PartSz, int NumParts,
                                  function_ref<unsigned(unsigned)> GetVF) {
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
        continue;
      const int VF = GetVF(I);
      // ...
      if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      int FirstMin = INT_MAX;
      int SecondVecFound = false;
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem) {
          Value *V = GatheredScalars[I * PartSz + K];
          if (!isConstant(V)) {
            SecondVecFound = true;
            break;
          }
          continue;
        }
        // ...
        SecondVecFound = true;
        break;
      }
      FirstMin = (FirstMin / PartSz) * PartSz;
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem)
          continue;
        Idx -= FirstMin;
        if (Idx >= PartSz) {
          SecondVecFound = true;
          break;
        }
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      }
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
    }
  };
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          unsigned VF = 0;
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            // ...
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            // ...
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
        return 0U;
      return std::max(Entries[I].front()->getVectorFactor(),
                      Entries[I].back()->getVectorFactor());
    });
  int NumUndefs =
      count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  // ...
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
}
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}

static bool isReverseOrder(ArrayRef<unsigned> Order) {
  assert(!Order.empty() &&
         "Order is empty. Please check it before using isReverseOrder.");
  unsigned Sz = Order.size();
  return all_of(enumerate(Order), [&](const auto &Pair) {
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
  });
}
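// Example (illustrative): Order == {3, 2, 1, 0} is a reverse order for
// Sz == 4, since Sz - I - 1 == Order[I] holds at every index; entries equal
// to Sz act as "unset" slots and are accepted by the predicate as well.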
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                  const DataLayout &DL, ScalarEvolution &SE,
                  SmallVectorImpl<unsigned> &SortedIndices) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return std::nullopt;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
    // ...
    PtrSCEVLowest = PtrSCEV;
    // ...
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
    // ...
    PtrSCEVHighest = PtrSCEV;
  }
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return nullptr;
  };
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    // ...
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return std::nullopt;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  // ...
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    unsigned Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
      // ...
      Dist = SC->getAPInt().getZExtValue();
    }
    // ...
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return std::nullopt;
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    SortedIndices.resize(PointerOps.size(), 0);
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
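// Note (illustrative): unlike the constant-stride path, this helper accepts
// a stride that is only known at run time. It asks SCEV for the distance
// between the lowest and highest pointer, factors out a common symbolic
// stride, and then requires every pointer to sit at a unique multiple of
// that stride; the offsets sorted above give the load order.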
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy);

// ...
int NumSrcElts = Tp->getElementCount().getKnownMinValue();
if (/* ... */ ShuffleVectorInst::isInsertSubvectorMask(
        Mask, NumSrcElts, NumSubElts, Index)) {
  if (Index + NumSubElts > NumSrcElts &&
      Index + NumSrcElts <= static_cast<int>(Mask.size()))
    // ...
}
BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                           SmallVectorImpl<unsigned> &Order,
                           SmallVectorImpl<Value *> &PointerOps,
                           unsigned *BestVF, bool TryRecursiveCheck) const {
  // ...
  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  const unsigned Sz = VL.size();
  // ...
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }
  // ...
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  // ...
  if (Order.empty()) {
    Ptr0 = PointerOps.front();
    PtrN = PointerOps.back();
  } else {
    Ptr0 = PointerOps[Order.front()];
    PtrN = PointerOps[Order.back()];
  }
  std::optional<int> Diff =
      getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
  // Check that the sorted loads are consecutive.
  if (static_cast<unsigned>(*Diff) == Sz - 1)
    return LoadsState::Vectorize;
  // ...
  bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
  // ...
  auto IsAnyPointerUsedOutGraph =
      IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
        return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
                 return !getTreeEntry(U) && !MustGather.contains(U);
               });
      });
  const unsigned AbsoluteDiff = std::abs(*Diff);
  if (IsPossibleStrided &&
      (IsAnyPointerUsedOutGraph ||
       (/* ... */ AbsoluteDiff > Sz) ||
       *Diff == -(static_cast<int>(Sz) - 1))) {
    int Stride = *Diff / static_cast<int>(Sz - 1);
    if (*Diff == Stride * static_cast<int>(Sz - 1)) {
      // ...
      else if (Ptr != Ptr0)
        // ...
      if (((Dist / Stride) * Stride) != Dist ||
          !Dists.insert(Dist).second)
        break;
      if (Dists.size() == Sz)
        return LoadsState::StridedVectorize;
    }
  }
  // ...
  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
                                                unsigned *BestVF,
                                                bool ProfitableGatherPointers) {
    // ...
    auto [ScalarGEPCost, VectorGEPCost] =
        getGEPCosts(TTI, PointerOps, PointerOps.front(),
                    Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
    // ...
    auto *PtrVecTy = getWidenedType(PointerOps.front()->getType(),
                                    VecTy->getNumElements());
    if (static_cast<unsigned>(
            count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
            PointerOps.size() - 1 ||
        /* ... */)
      VectorGEPCost += TTI.getScalarizationOverhead(
          PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false,
          CostKind);
    // ...
    InstructionCost MaskedGatherCost =
        TTI.getGatherScatterOpCost(Instruction::Load, VecTy,
                                   cast<LoadInst>(VL0)->getPointerOperand(),
                                   /*VariableMask=*/false, CommonAlignment,
                                   CostKind) +
        (ProfitableGatherPointers ? 0 : VectorGEPCost);
    // ...
    constexpr unsigned ListLimit = 4;
    if (!TryRecursiveCheck || VL.size() < ListLimit)
      return false;
    // ...
    unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
    // ...
    for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
      // ...
      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
        // ...
        DemandedElts.setBits(Cnt, Cnt + VF);
        // ...
      }
      if (!DemandedElts.isZero()) {
        // ...
        for (unsigned Idx : seq<unsigned>(VL.size()))
          if (DemandedElts[Idx])
            // ...
        // ...
      }
      // ...
      auto *LI0 = cast<LoadInst>(VL[I * VF]);
      // ... (per-strategy cost estimation)
      //     getGEPCosts(TTI, ..., LI0->getPointerOperand(),
      //                 Instruction::GetElementPtr, CostKind, ScalarTy, ...);
      if (static_cast<unsigned>(
              count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
              PointerOps.size() - 1 ||
          /* ... */)
        // ... (costs use LI0->getPointerAddressSpace(), CostKind, and
        //      LI0->getPointerOperand() for the strided/gather variants)
      for (int Idx : seq<int>(0, VL.size()))
        // ...
      if (MaskedGatherCost >= VecLdCost &&
          /* ... */)
        return true;
    }
    // ...
  };
  // ...
  bool ProfitableGatherPointers =
      L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
             return L->isLoopInvariant(V);
           })) <= Sz / 2;
  if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
        return !GEP || (GEP && GEP->getNumOperands() == 2 &&
                        isa<Constant, Instruction>(GEP->getOperand(1)));
      })) {
    // ...
    if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
                                                     ProfitableGatherPointers))
      return LoadsState::ScatterVectorize;
  }
  return LoadsState::Gather;
}
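// Summary of the decision ladder above (illustrative): consecutive pointers
// (*Diff == Sz - 1) vectorize directly; a constant distance that divides
// evenly into Sz - 1 steps may yield a strided load; otherwise the cost
// model compares a masked gather and shuffled sub-vector loads against
// plain per-element gathering before falling back to LoadsState::Gather.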
5305 "Expected list of pointer operands.");
5315 .first->second.emplace_back().emplace_back(VL.
front(), 0U, 0U);
5317 SortedIndices.
clear();
5319 auto Key = std::make_pair(BBs[Cnt + 1],
5323 std::optional<int> Diff = getPointersDiff(
5324 ElemTy, std::get<0>(Base.front()), ElemTy,
5330 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5336 if (Bases.
size() > VL.
size() / 2 - 1)
5340 Bases.
find(Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
5347 if (Bases.
size() == 1 && (Bases.
front().second.size() == 1 ||
5348 Bases.
front().second.size() == VL.
size()))
5353 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
5363 FirstPointers.
insert(P1);
5364 SecondPointers.
insert(P2);
5370 "Unable to find matching root.");
5373 for (
auto &
Base : Bases) {
5374 for (
auto &Vec :
Base.second) {
5375 if (Vec.size() > 1) {
5376 stable_sort(Vec, [](
const std::tuple<Value *, int, unsigned> &
X,
5377 const std::tuple<Value *, int, unsigned> &
Y) {
5378 return std::get<1>(
X) < std::get<1>(
Y);
5380 int InitialOffset = std::get<1>(Vec[0]);
5381 bool AnyConsecutive =
5383 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
5387 if (!AnyConsecutive)
5392 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5396 for (
auto &
T : Bases)
5397 for (
const auto &Vec :
T.second)
5398 for (
const auto &
P : Vec)
5402 "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();

  SmallVector<Value *> Ptrs;
  Ptrs.reserve(TE.Scalars.size());
  SmallVector<BasicBlock *> BBs;
  BBs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    Ptrs.push_back(L->getPointerOperand());
    BBs.push_back(L->getParent());
  }

  BoUpSLP::OrdersType Order;
  if (!LoadEntriesToVectorize.contains(TE.Idx) &&
      clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
    return std::move(Order);
  return std::nullopt;
}
if (VU->getType() != V->getType())
  return false;
// ...
if (!VU->hasOneUse() && !V->hasOneUse())
  return false;
// ...
if (Idx1 == std::nullopt || Idx2 == std::nullopt)
  return false;
// ...
SmallBitVector ReusedIdx(
    cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
bool IsReusedIdx = false;
do {
  if (IE2 == VU && !IE1)
    return VU->hasOneUse();
  if (IE1 == V && !IE2)
    return V->hasOneUse();
  if (IE1 && IE1 != V) {
    // ...
    IsReusedIdx |= ReusedIdx.test(Idx1);
    ReusedIdx.set(Idx1);
    if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
      IE1 = nullptr;
    else
      IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
  }
  if (IE2 && IE2 != VU) {
    // ...
    IsReusedIdx |= ReusedIdx.test(Idx2);
    ReusedIdx.set(Idx2);
    if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
      IE2 = nullptr;
    else
      IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
  }
} while (!IsReusedIdx && (IE1 || IE2));
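// Note (illustrative): the loop walks the two insertelement chains toward
// their common base vector; it answers whether both values feed the same
// buildvector sequence, bailing out as soon as an insertion index is reused
// or an intermediate insertelement has more than one user.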
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
  if (!TE.ReuseShuffleIndices.empty()) {
    // FIXME: support non-power-of-2 vectors.
    assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    if (/* ... */)
      return std::nullopt;
    // ...
    unsigned Sz = TE.Scalars.size();
    if (TE.isGather()) {
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE)) {
        // ...
        ::addMask(Mask, TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          // ...
          Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        /* single register for */ (2 * TE.getVectorFactor()) == 1)
      return std::nullopt;
    // ...
    if (TE.ReorderIndices.empty())
      std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
    else
      inversePermutation(TE.ReorderIndices, ReorderMask);
    ::addMask(ReorderMask, TE.ReuseShuffleIndices);
    unsigned VF = ReorderMask.size();
    // ...
    for (unsigned I = 0; I < VF; I += Sz) {
      // ...
      unsigned UndefCnt = 0;
      unsigned Limit = std::min(Sz, VF - I);
      if (/* ... */
          Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
          /* ... */)
        return std::nullopt;
      // ...
      for (unsigned K = 0; K < NumParts; ++K) {
        unsigned Idx = Val + Sz * K;
        if (Idx < VF)
          ResOrder[Idx] = I + K;
      }
    }
    return std::move(ResOrder);
  }
  unsigned VF = TE.getVectorFactor();
  // ...
  SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                              TE.ReuseShuffleIndices.end());
  if (TE.getOpcode() == Instruction::ExtractElement &&
      all_of(TE.Scalars, [Sz](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
        return Idx && *Idx < Sz;
      })) {
    assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
                                 "by BinaryOperator and CastInst.");
    // ...
    if (TE.ReorderIndices.empty())
      std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
    else
      inversePermutation(TE.ReorderIndices, ReorderMask);
    for (unsigned I = 0; I < VF; ++I) {
      int &Idx = ReusedMask[I];
      if (Idx == PoisonMaskElem)
        continue;
      Value *V = TE.Scalars[ReorderMask[Idx]];
      // ...
      Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
    }
  }
  std::iota(ResOrder.begin(), ResOrder.end(), 0);
  auto *It = ResOrder.begin();
  for (unsigned K = 0; K < VF; K += Sz) {
    // ...
    std::iota(SubMask.begin(), SubMask.end(), 0);
    // ...
    transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
    std::advance(It, Sz);
  }
  if (all_of(enumerate(ResOrder), [](const auto &Data) {
        return Data.index() == Data.value();
      }))
    return std::nullopt;
  return std::move(ResOrder);

  // ... (end of the reused-shuffle handling)

  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      any_of(TE.UserTreeIndices,
             [](const EdgeInfo &EI) {
               return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
             }) &&
      (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
    return std::nullopt;
  if ((TE.State == TreeEntry::Vectorize ||
       TE.State == TreeEntry::StridedVectorize) &&
      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
    assert(!TE.isAltShuffle() && "Alternate instructions are only supported by "
                                 "BinaryOperator and CastInst.");
    return TE.ReorderIndices;
  }
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;

    SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
    for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
      if (!V->hasNUsesOrMore(1))
        continue;
      auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
      if (!II)
        continue;
      // ...
      while (II && II->hasOneUse() && II->getParent() == BB) {
        // ...
        II = dyn_cast<InsertElementInst>(II->getOperand(0));
      }
      // ...
    }
    auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
      assert(BB1 != BB2 && "Expected different basic blocks.");
      auto *NodeA = DT->getNode(BB1);
      auto *NodeB = DT->getNode(BB2);
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
    };
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
        return false;
      if (isa<PoisonValue>(V1))
        return true;
      if (isa<PoisonValue>(V2))
        return false;
      // ...
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
        return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
                                    FirstUserOfPhi2->getParent());
      auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
      auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
      auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
      auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
      if (IE1 && IE2) {
        if (UserBVHead[I1] && !UserBVHead[I2])
          return true;
        if (!UserBVHead[I1])
          return false;
        if (UserBVHead[I1] == UserBVHead[I2])
          // ...
        if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
          return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
                                      UserBVHead[I2]->getParent());
        return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
      }
      if (EE1 && EE2) {
        auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
        auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
        auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
        auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
        // ...
        if (EE1->getOperand(0) == EE2->getOperand(0))
          // ...
        if (!Inst1 && Inst2)
          return false;
        if (Inst1 && Inst2) {
          // ...
        }
        assert((P1 && P2) &&
               "Expected either instructions or arguments vector operands.");
        return P1->getArgNo() < P2->getArgNo();
      }
      return false;
    };
    OrdersType Phis(TE.Scalars.size());
    std::iota(Phis.begin(), Phis.end(), 0);
    stable_sort(Phis, PHICompare);
    if (isIdentityOrder(Phis))
      return std::nullopt;
    return std::move(Phis);
  }
  if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
    // ...
    if ((TE.getOpcode() == Instruction::ExtractElement ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
        all_of(TE.Scalars, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        })) {
      // ...
      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
                                   /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // ...
    int Sz = TE.Scalars.size();
    if (/* ... */
        count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
      const auto *It =
          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
      if (It == TE.Scalars.begin())
        return OrdersType();
      if (It != TE.Scalars.end()) {
        // ...
        unsigned Idx = std::distance(TE.Scalars.begin(), It);
        // ... (compare the cost of inserting first vs. inserting at Idx plus
        //      a permute)
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          OrdersType Order(Sz, Sz);
          Order[Idx] = 0;
          return std::move(Order);
        }
      }
    }
    if (isSplat(TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    // ...
    if (TE.isGather() && TE.getOpcode() == Instruction::Load) {
      SmallVector<Value *> PointerOps;
      OrdersType CurrentOrder;
      LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                         CurrentOrder, PointerOps);
      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
        return std::move(CurrentOrder);
    }
    // ...
    if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
      return CurrentOrder;
  }
  return std::nullopt;
}
for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
  ArrayRef<int> Cluster = Mask.slice(I, Sz);
  if (Cluster != FirstCluster)
    return false;
}
return true;
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder reuses mask.
  reorderReuses(TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  if (!TE.isGather() ||
      /* ... */)
    return;
  // ...
  addMask(NewMask, TE.ReuseShuffleIndices);
  // ...
  TE.ReorderIndices.clear();
  // ...
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}
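// Note (illustrative): after the scalars of a gather node are permuted, the
// reuse mask no longer needs to encode the permutation, so each Sz-wide
// chunk of ReuseShuffleIndices is reset to the identity {0, 1, ..., Sz-1}.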
5849 "Expected same size of orders");
5850 unsigned Sz = Order.
size();
5852 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
5853 if (Order[
Idx] != Sz)
5854 UsedIndices.
set(Order[
Idx]);
5856 if (SecondaryOrder.
empty()) {
5857 for (
unsigned Idx : seq<unsigned>(0, Sz))
5858 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
5861 for (
unsigned Idx : seq<unsigned>(0, Sz))
5862 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
5863 !UsedIndices.
test(SecondaryOrder[
Idx]))
5864 Order[
Idx] = SecondaryOrder[
Idx];
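// Worked example (illustrative): with Sz == 4, Order == {1, 4, 2, 4} (4
// marks an unset slot) and SecondaryOrder == {1, 3, 2, 0}, positions 1 and
// 3 are filled from the secondary order because indices 3 and 0 are not yet
// used, giving Order == {1, 3, 2, 0}.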
void BoUpSLP::reorderTopToBottom() {
  // ...
  DenseMap<const TreeEntry *, SmallVector<OrdersType>>
      ExternalUserReorderMap;
  // ...
  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      ExternalUserReorderMap.try_emplace(
          TE.get(), std::move(ExternalUserReorderIndices));
    }
    // ...
    if (TE->isAltShuffle()) {
      // ...
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      // ...
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
        // ...
      }
    }
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/true)) {
      // ...
      const TreeEntry *UserTE = TE.get();
      while (true) {
        if (UserTE->UserTreeIndices.size() != 1)
          break;
        if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
              return EI.UserTE->State == TreeEntry::Vectorize &&
                     EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
            }))
          return;
        UserTE = UserTE->UserTreeIndices.back().UserTE;
      }
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });

  // Reorder the graph nodes according to their vectorization factor.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // ...
    for (const TreeEntry *OpTE : OrderedEntries) {
      // ...
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // ...
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
        }
        // ...
      }
      // Stores actually store the mask, not the order, need to invert.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // ...
        unsigned E = Order.size();
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        // ...
        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
      } else {
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
        // ...
      }
    }
    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // Prefer identity order. But, if filled identity found (non-empty
      // order) with same number of uses, as the new candidate order, we can
      // choose this candidate order.
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           isIdentityOrder(BestOrder))) {
        // ...
        BestOrder = Pair.first;
        Cnt = Pair.second;
      }
      // ...
    }
    // ...
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // Do an actual reordering, if profitable.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // Just do the reordering for the nodes with the given VF.
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          // ...
          assert(all_of(TE->UserTreeIndices,
                        [VF, &TE](const EdgeInfo &EI) {
                          return EI.UserTE->Scalars.size() == VF ||
                                 EI.UserTE->Scalars.size() ==
                                     TE->Scalars.size();
                        }) &&
                 "All users must be of VF size.");
          if (SLPReVec) {
            // ...
            assert(all_of(TE->UserTreeIndices,
                          [&](const EdgeInfo &EI) {
                            return isa<ShuffleVectorInst>(
                                EI.UserTE->getMainOp());
                          }) &&
                   "Does not know how to reorder.");
          }
          // Update ordering of the operands with the smaller VF than the
          // given one.
          reorderNodeWithReuses(*TE, Mask);
        }
        continue;
      }
      if ((TE->State == TreeEntry::Vectorize ||
           TE->State == TreeEntry::StridedVectorize) &&
          (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
               InsertElementInst>(TE->getMainOp()) ||
           (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
        assert(!TE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // ...
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        // ...
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // ...
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      }
    }
  }
}
bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    ArrayRef<TreeEntry *> ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize);
        }))
      continue;
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // Do not reorder if operand node is used by many user nodes.
      if (any_of(TE->UserTreeIndices,
                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
        return false;
      // ...
      Edges.emplace_back(I, TE);
      // ...
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    TreeEntry *Gather = nullptr;
    if (count_if(ReorderableGathers,
                 [&Gather, UserTE, I](TreeEntry *TE) {
                   assert(TE->State != TreeEntry::Vectorize &&
                          TE->State != TreeEntry::StridedVectorize &&
                          "Only non-vectorized nodes are expected.");
                   if (any_of(TE->UserTreeIndices,
                              [UserTE, I](const EdgeInfo &EI) {
                                return EI.UserTE == UserTE && EI.EdgeIdx == I;
                              })) {
                     assert(TE->isSame(UserTE->getOperand(I)) &&
                            "Operand entry does not match operands.");
                     Gather = TE;
                     return true;
                   }
                   return false;
                 }) > 1 &&
        /* ... */)
      return false;
    if (Gather)
      GatherOps.push_back(Gather);
  }
  return true;
}
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  // ...
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize)
      NonVectorized.push_back(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false)) {
      OrderedEntries.insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
    }
  }

  // ...
  while (!OrderedEntries.empty()) {
    // 1. Filter out non-reorderable nodes.
    // ...
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          !all_of(drop_begin(TE->UserTreeIndices),
                  [TE](const EdgeInfo &EI) {
                    return EI.UserTE == TE->UserTreeIndices.front().UserTE;
                  }) ||
          !Visited.insert(TE).second) {
        Filtered.push_back(TE);
        continue;
      }
      // ...
      for (EdgeInfo &EI : TE->UserTreeIndices)
        Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
    }
    // Erase filtered entries.
    for (TreeEntry *TE : Filtered)
      OrderedEntries.remove(TE);
    SmallVector<
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
        UsersVec(Users.begin(), Users.end());
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    });
    for (auto &Data : UsersVec) {
      // ...
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              GatherOps)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // ...
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
          continue;
        const auto Order = [&]() -> const OrdersType {
          if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
            // ...
          return OpTE->ReorderIndices;
        }();
        // ...
        if (Order.size() == 1)
          continue;
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask, not the order, need to invert.
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          assert(!OpTE->isAltShuffle() &&
                 "Alternate instructions are only supported by BinaryOperator "
                 "and CastInst.");
          // ...
          unsigned E = Order.size();
          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          // ...
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
              NumOps;
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        }
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto AllowsReordering = [&](const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->isGather()) {
            // ...
          }
          return false;
        };
        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
          TreeEntry *UserTE = EI.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            continue;
          // ...
          if (AllowsReordering(UserTE))
            continue;
          // ...
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           all_of(Op.second->UserTreeIndices,
                                  [UserTE](const EdgeInfo &EI) {
                                    return EI.UserTE == UserTE;
                                  });
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
      if (OrdersUses.empty()) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // Choose the most used order.
      unsigned IdentityCnt = 0;
      unsigned VF = Data.second.front().second->getVectorFactor();
      OrdersType IdentityOrder(VF, VF);
      for (auto &Pair : OrdersUses) {
        if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
          IdentityCnt += Pair.second;
          // ...
        }
      }
      MutableArrayRef<unsigned> BestOrder = IdentityOrder;
      unsigned Cnt = IdentityCnt;
      for (auto &Pair : OrdersUses) {
        // ...
        if (Cnt < Pair.second) {
          // ...
          BestOrder = Pair.first;
          Cnt = Pair.second;
        }
        // ...
      }
      // ...
      if (BestOrder.empty()) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // ...
      unsigned E = BestOrder.size();
      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        OrderedEntries.remove(TE);
        if (!VisitedOps.insert(TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
          continue;
        }
        // ...
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::StridedVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        // ...
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers just need to reorder their scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // ...
          continue;
        }
        // ...
        OrderedEntries.remove(Gather);
      }
      // Reorder operands of the user node and set the ordering for the user
      // node itself.
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Data.first->getMainOp()) ||
          Data.first->isAltShuffle())
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle() ||
          Data.first->State == TreeEntry::StridedVectorize) {
        // ...
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !Data.first->isAltShuffle()) {
          // ...
          OrderedEntries.insert(Data.first);
        }
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if ((Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
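// Note (illustrative): for a reversed strided load/store the "root" memory
// access is the scalar at the front of the reorder indices, not the first
// scalar in the bundle, since the vector instruction is anchored at the
// lowest address.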
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  // ...
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();
    // No need to handle users of gathered values.
    if (Entry->isGather())
      continue;
    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
        continue;
      // ...
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;
      // Check if the scalar is externally used as an extra arg.
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
        continue;
      }
      for (User *U : Scalar->users()) {
        // ...
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;
        // Skip in-tree scalars that become vectors.
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          // ...
          if (UseEntry->State == TreeEntry::ScatterVectorize ||
              !doesInTreeUserNeedToExtract(
                  Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(!UseEntry->isGather() && "Bad state");
            continue;
          }
          U = nullptr;
          if (It != ScalarToExtUses.end()) {
            ExternalUses[It->second].User = nullptr;
            break;
          }
        }
        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          U = nullptr;
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, FoundLane);
        break;
      }
    }
  }
}
SmallVector<SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  // ...
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (!isa<Instruction>(V))
      continue;
    // ...
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      // ...
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          /* ... */)
        continue;
      // Skip entries already in the tree.
      if (getTreeEntry(U))
        continue;
      // ...
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(), Ptr}];
      // For now, just keep one store per pointer object per lane.
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        std::optional<int> Diff = getPointersDiff(
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
            /*StrictCheck=*/true);
        // We failed to compare the pointers so just abandon this store.
        if (!Diff)
          continue;
      }
      StoresVec.push_back(SI);
    }
  }
  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
  unsigned I = 0;
  for (auto &P : PtrToStoresMap) {
    Res[I].swap(P.second);
    ++I;
  }
  return Res;
}
bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // ...
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    // ...
    std::optional<int> Diff =
        getPointersDiff(/* ... */,
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    // ...
  }
  // ...
  if (StoreOffsetVec.size() != StoresVec.size())
    return false;
  sort(StoreOffsetVec,
       [](const std::pair<int, unsigned> &L,
          const std::pair<int, unsigned> &R) { return L.first < R.first; });
  // ...
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
      return false;
    // ...
  }
  // Calculate the shuffle indices against the sorted offsets.
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
  for (auto [I, P] : enumerate(StoreOffsetVec)) {
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  }
  // Identity order is modeled as an empty OrdersType.
  if (IsIdentity)
    ReorderIndices.clear();
  return true;
}

#ifndef NDEBUG
// ...
for (unsigned Idx : Order)
  // ...
#endif

SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();
  // ...
  for (ArrayRef<StoreInst *> StoresVec : collectUserStores(TE)) {
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;
    // If the stores are not consecutive then abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;
    // ...
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}
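// Note (illustrative): the reorder indices collected here describe how
// external store users would like the tree entry's lanes permuted; they are
// later counted alongside in-tree orders in reorderTopToBottom(), so a
// popular store order can win over the tree's own preferred order.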
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  // ...
  buildTree_rec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  // ...
  buildTree_rec(Roots, 0, EdgeInfo());
}
static void gatherPossiblyVectorizableLoads(
    const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
    ScalarEvolution &SE, const TargetTransformInfo &TTI,
    SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
    bool AddNew = true) {
  // ...
  for (Value *V : VL) {
    auto *LI = dyn_cast<LoadInst>(V);
    if (!LI)
      continue;
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
      continue;
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             // ...
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
      std::optional<int> Dist = getPointersDiff(
          LI->getType(), LI->getPointerOperand(),
          Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
          /*StrictCheck=*/true);
      if (!Dist)
        continue;
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
        continue;
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
      }
      IsFound = true;
      break;
    }
    // ...
  }
  auto FindMatchingLoads =
      [&](ArrayRef<std::pair<LoadInst *, int>> Loads,
          SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>>
              &GatheredLoads,
          SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
          int &Offset, unsigned &Start) {
        if (Loads.empty())
          return GatheredLoads.end();
        // ...
        std::optional<int> Dist =
            getPointersDiff(/* ... */,
                            Data.front().first->getType(),
                            Data.front().first->getPointerOperand(), DL, SE,
                            /*StrictCheck=*/true);
        // ...
        for (std::pair<LoadInst *, int> P : Data) {
          // ...
        }
        // ...
        unsigned NumUniques = 0;
        for (auto [Cnt, Pair] : enumerate(Loads)) {
          bool Used = DataLoads.contains(Pair.first);
          if (!Used && !DataDists.contains(*Dist + Pair.second)) {
            ++NumUniques;
            ToAdd.insert(Cnt);
          } else if (Used) {
            Repeated.insert(Cnt);
          }
        }
        if (NumUniques > 0 &&
            (Loads.size() == NumUniques ||
             (Loads.size() - NumUniques >= 2 &&
              Loads.size() - NumUniques >= Loads.size() / 2 &&
              /* ... */))) {
          // ...
          return std::next(GatheredLoads.begin(), Idx);
        }
        // ...
        return GatheredLoads.end();
      };
  for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
    // ...
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                                 Offset, Start);
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
        It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
      ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
                             Start);
    }
    if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
          return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        })) {
      // ...
      for (unsigned Idx : seq<unsigned>(Data.size())) {
        // ...
      }
      // ...
      It = find_if(GatheredLoads,
                   [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
                     return PD.front().first->getParent() == LI->getParent() &&
                            PD.front().first->getType() == LI->getType();
                   });
      while (It != GatheredLoads.end()) {
        // ...
        It = std::find_if(
            std::next(It), GatheredLoads.end(),
            [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
              return PD.front().first->getParent() == LI->getParent() &&
                     PD.front().first->getType() == LI->getType();
            });
      }
      // ...
      GatheredLoads.emplace_back().append(Data.begin(), Data.end());
      AddNewLoads(GatheredLoads.emplace_back());
    }
  }
}
void BoUpSLP::tryToVectorizeGatheredLoads(
    const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                         SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
                         8> &GatheredLoads) {
  GatheredLoadsEntriesFirst = VectorizableTree.size();

  SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
      LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert(VectorizableTree[Idx]->Scalars.begin(),
               VectorizableTree[Idx]->Scalars.end());

  // Sort loads by distance.
  auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
                       const std::pair<LoadInst *, int> &L2) {
    return L1.second > L2.second;
  };

  auto IsMaskedGatherSupported = [&](ArrayRef<LoadInst *> Loads) {
    // ...
    Align Alignment = computeCommonAlignment<LoadInst>(Values);
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    return TTI->isLegalMaskedGather(Ty, Alignment) &&
           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
  };

  auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
                                    SmallPtrSetImpl<LoadInst *> &VectorizedLoads,
                                    SmallVectorImpl<LoadInst *> &NonVectorized,
                                    bool Final, unsigned MaxVF) {
    // ...
    unsigned StartIdx = 0;
    SmallVector<int> CandidateVFs;
    if (hasFullVectorsOrPowerOf2(*TTI, Loads.front()->getType(), MaxVF))
      CandidateVFs.push_back(MaxVF);
    for (int NumElts = getFloorFullVectorNumberOfElements(
             *TTI, Loads.front()->getType(), MaxVF - 1);
         NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
                          *TTI, Loads.front()->getType(), NumElts - 1)) {
      CandidateVFs.push_back(NumElts);
      // ...
    }
    if (Final && CandidateVFs.empty())
      return Results;

    unsigned BestVF = Final ? CandidateVFs.back() : 0;
    for (unsigned NumElts : CandidateVFs) {
      if (Final && NumElts > BestVF)
        continue;
      SmallVector<unsigned> MaskedGatherVectorized;
      for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
           ++Cnt) {
        ArrayRef<LoadInst *> Slice =
            ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
        if (VectorizedLoads.count(Slice.front()) ||
            VectorizedLoads.count(Slice.back()) ||
            areKnownNonVectorizableLoads(Slice))
          continue;
        // ...
        bool AllowToVectorize = false;
        if (NumElts == 2) {
          // ...
          auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
            for (LoadInst *LI : Slice) {
              // If single use/user - allow to vectorize.
              if (LI->hasOneUse())
                continue;
              // 1. Check if number of uses equals number of users.
              if (static_cast<unsigned int>(std::distance(
                      LI->user_begin(), LI->user_end())) != LI->getNumUses())
                return false;
              if (!IsLegalBroadcastLoad)
                continue;
              // ...
              for (User *U : LI->users()) {
                if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
                  continue;
                if (const TreeEntry *UTE = getTreeEntry(U)) {
                  for (int I : seq<int>(UTE->getNumOperands())) {
                    if (all_of(UTE->getOperand(I),
                               [LI](Value *V) { return V == LI; }))
                      // Found legal broadcast - do not vectorize.
                      return false;
                  }
                }
              }
            }
            return true;
          };
          AllowToVectorize = CheckIfAllowed(Slice);
        } else {
          AllowToVectorize =
              (NumElts >= 3 ||
               any_of(ValueToGatherNodes.at(Slice.front()),
                      [=](const TreeEntry *TE) {
                        return TE->Scalars.size() == 2 &&
                               ((TE->Scalars.front() == Slice.front() &&
                                 TE->Scalars.back() == Slice.back()) ||
                                (TE->Scalars.front() == Slice.back() &&
                                 TE->Scalars.back() == Slice.front()));
                      })) &&
              /* ... */;
        }
        if (AllowToVectorize) {
          // Try to build vector load.
          ArrayRef<Value *> Values(
              reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
          LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
                                            PointerOps, &BestVF);
          if (LS != LoadsState::Gather ||
              (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
            if (LS == LoadsState::ScatterVectorize) {
              if (MaskedGatherVectorized.empty() ||
                  Cnt >= MaskedGatherVectorized.back() + NumElts)
                MaskedGatherVectorized.push_back(Cnt);
              continue;
            }
            // ...
            Results.emplace_back(Values, LS);
            VectorizedLoads.insert(Slice.begin(), Slice.end());
            // ...
            if (Cnt == StartIdx)
              StartIdx += NumElts;
            // Check if the whole array was vectorized already - exit.
            if (StartIdx >= Loads.size())
              break;
            // ...
            continue;
          }
        }
        if (!MaskedGatherVectorized.empty() &&
            Cnt < MaskedGatherVectorized.back() + NumElts)
          continue;
        // ...
        if (!AllowToVectorize || BestVF == 0)
          registerNonVectorizableLoads(Slice);
      }
      // Mark masked gather candidates as vectorized, if any.
      for (unsigned Cnt : MaskedGatherVectorized) {
        ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
            Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
        // ...
        VectorizedLoads.insert(Slice.begin(), Slice.end());
        // ...
        if (Cnt == StartIdx)
          StartIdx += NumElts;
      }
    }
    for (LoadInst *LI : Loads)
      if (!VectorizedLoads.contains(LI))
        NonVectorized.push_back(LI);
    return Results;
  };
  auto ProcessGatheredLoads =
      [&](ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
          bool Final = false) {
        SmallVector<LoadInst *> NonVectorized;
        for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
            continue;
          }
          SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
          SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
          transform(LoadsDists, OriginalLoads.begin(),
                    [](const std::pair<LoadInst *, int> &L) { return L.first; });
          stable_sort(LocalLoadsDists, LoadSorter);
          SmallVector<LoadInst *> Loads;
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
            if (getTreeEntry(L.first))
              continue;
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<unsigned>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
              // ...
            }
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
                !Loads.empty())
              // ...
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
            Loads.push_back(L.first);
          }
          if (Loads.size() <= 1)
            continue;
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
            continue;
          // ...
          SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                                  Final, MaxConsecutiveDistance);
          if (!Results.empty() && !SortedNonVectorized.empty() &&
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size() &&
              /* ... */) {
            VectorizedLoads.clear();
            // ...
            SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
                UnsortedResults =
                    GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                        UnsortedNonVectorized, Final,
                                        OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
            }
          }
          for (auto [Slice, _] : Results) {
            LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
                              << Slice.size() << ")\n");
            if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) {
              for (Value *L : Slice)
                if (!getTreeEntry(L))
                  SortedNonVectorized.push_back(cast<LoadInst>(L));
              continue;
            }
            // Select maximum VF as a maximum of user gathered nodes and
            // distance between scalar loads in these nodes.
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
            if (MaxVF == 2) {
              UserMaxVF = MaxVF;
            } else {
              // ...
              std::optional<unsigned> InterleavedLoadsDistance = 0;
              unsigned Order = 0;
              std::optional<unsigned> CommonVF = 0;
              DenseMap<const TreeEntry *, unsigned> EntryToPosition;
              SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
              for (auto [Idx, V] : enumerate(Slice)) {
                for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                  UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
                  unsigned Pos =
                      EntryToPosition.try_emplace(E, Idx).first->second;
                  UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                  if (CommonVF) {
                    if (*CommonVF == 0) {
                      CommonVF = E->Scalars.size();
                      continue;
                    }
                    if (*CommonVF != E->Scalars.size())
                      CommonVF.reset();
                  }
                  // Check if the load is the part of the interleaved load.
                  if (Pos != Idx && InterleavedLoadsDistance) {
                    if (!DeinterleavedNodes.contains(E) &&
                        any_of(E->Scalars, [&, Slice = Slice](Value *V) {
                          if (isa<Constant>(V))
                            return false;
                          if (getTreeEntry(V))
                            return true;
                          const auto &Nodes = ValueToGatherNodes.at(V);
                          return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                                 !is_contained(Slice, V);
                        })) {
                      InterleavedLoadsDistance.reset();
                      continue;
                    }
                    DeinterleavedNodes.insert(E);
                    if (*InterleavedLoadsDistance == 0) {
                      InterleavedLoadsDistance = Idx - Pos;
                      continue;
                    }
                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
                      InterleavedLoadsDistance.reset();
                    Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
                  }
                }
              }
              DeinterleavedNodes.clear();
              // Check if the large load represents interleaved load operation.
              if (InterleavedLoadsDistance.value_or(0) > 1 &&
                  CommonVF.value_or(0) != 0) {
                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
                unsigned VF = *CommonVF;
                // ...
                if (InterleaveFactor <= Slice.size() &&
                    TTI->isLegalInterleavedAccessType(
                        getWidenedType(Slice.front()->getType(), VF),
                        InterleaveFactor,
                        cast<LoadInst>(Slice.front())->getAlign(),
                        cast<LoadInst>(Slice.front())
                            ->getPointerAddressSpace())) {
                  UserMaxVF = InterleaveFactor * VF;
                } else {
                  InterleaveFactor = 0;
                }
              }
              // Cannot represent the loads as consecutive vectorizable nodes -
              // just exit.
              unsigned ConsecutiveNodesSize = 0;
              if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                  any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                         [&, Slice = Slice](const auto &P) {
                           const auto *It = find_if(Slice, [&](Value *V) {
                             return std::get<1>(P).contains(V);
                           });
                           if (It == Slice.end())
                             return false;
                           ArrayRef<Value *> VL =
                               VectorizableTree[std::get<0>(P)]->Scalars;
                           ConsecutiveNodesSize += VL.size();
                           unsigned Start = std::distance(Slice.begin(), It);
                           unsigned Sz = Slice.size() - Start;
                           return Sz < VL.size() ||
                                  Slice.slice(std::distance(Slice.begin(), It),
                                              VL.size()) != VL;
                         }))
                continue;
              // Try to build long masked gather loads.
              UserMaxVF = bit_ceil(UserMaxVF);
              if (InterleaveFactor == 0 &&
                  any_of(seq<unsigned>(Slice.size() / UserMaxVF),
                         [&, Slice = Slice](unsigned Idx) {
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           return canVectorizeLoads(
                                      Slice.slice(Idx * UserMaxVF, UserMaxVF),
                                      Slice[Idx * UserMaxVF], Order,
                                      PointerOps) ==
                                  LoadsState::ScatterVectorize;
                         }))
                UserMaxVF = MaxVF;
              if (Slice.size() != ConsecutiveNodesSize)
                MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            }
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                ArrayRef<Value *> SubSlice =
                    Slice.slice(I, std::min(VF, E - I));
                if (getTreeEntry(SubSlice.front()))
                  continue;
                // ...
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                             return !SubSlice.equals(
                                        VectorizableTree[std::get<0>(P)]
                                            ->Scalars) &&
                                    set_is_subset(SubSlice, std::get<1>(P));
                           }))
                  continue;
                unsigned Sz = VectorizableTree.size();
                buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                  // Try non-interleaved vectorization with smaller vector
                  // factor.
                  if (InterleaveFactor > 0) {
                    VF = 2 * (MaxVF / InterleaveFactor);
                    InterleaveFactor = 0;
                  }
                  continue;
                }
              }
              if (IsVectorized)
                break;
            }
          }
          NonVectorized.append(SortedNonVectorized);
        }
        return NonVectorized;
      };
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
    if (!Ref.empty() && !NonVectorized.empty() &&
        std::accumulate(
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S, ArrayRef<std::pair<LoadInst *, int>> LoadsDists) {
              return S + LoadsDists.size();
            }) != NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
      for (LoadInst *LI : NonVectorized) {
        // Reinsert non-vectorized loads to other list of loads with the same
        // base pointers.
        gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
                                        FinalGatheredLoads,
                                        /*AddNew=*/false);
      }
      // Final attempt to vectorize non-vectorized loads.
      (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
    }
  }
  // Try to vectorize postponed load entries, previously marked as gathered.
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
    // Avoid reordering, if possible.
    if (!E.ReorderIndices.empty()) {
      // ...
    }
    buildTree_rec(GatheredScalars, 0, EdgeInfo());
  }
  // If no new entries created, consider it as no gathered loads entries must
  // be handled.
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
}
static Value *needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!NeedsScheduling) {
      NeedsScheduling = V;
      continue;
    }
    return nullptr;
  }
  return NeedsScheduling;
}
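// Example (illustrative): for VL = {%a = add ..., <poison>, <poison>} only
// %a needs scheduling, so the helper returns %a; if two or more values need
// scheduling (or none do), it returns nullptr and the bundle goes through
// the full dependency-driven scheduler instead of the fast path.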
static std::pair<size_t, size_t>
generateKeySubkey(Value *V, const TargetLibraryInfo *TLI,
                  function_ref<hash_code(size_t, LoadInst *)>
                      LoadsSubkeyGenerator,
                  bool AllowAlternate) {
  // ...
  if (auto *LI = dyn_cast<LoadInst>(V)) {
    // ...
    SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
    // ...
  }
  if (isa<ExtractElementInst, UndefValue>(V))
    // ...
  if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
    if (/* ... */
        !isa<UndefValue>(EI->getIndexOperand()))
      SubKey = hash_value(EI->getVectorOperand());
  } else if (auto *I = dyn_cast<Instruction>(V)) {
    if ((isa<BinaryOperator, CastInst>(I)) &&
        /* ... */) {
      // ...
      //     : cast<CastInst>(I)->getOperand(0)->getType()));
      if (isa<CastInst>(I)) {
        std::pair<size_t, size_t> OpVals =
            generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
                              /*AllowAlternate=*/true);
        // ...
      }
    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
      // ...
      if (CI->isCommutative())
        // ...
    } else if (auto *Call = dyn_cast<CallInst>(I)) {
      // ...
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
        SubKey = hash_value(Gep->getPointerOperand());
      else
        SubKey = hash_value(Gep);
    } else if (/* ... */
               !isa<ConstantInt>(I->getOperand(1))) {
      // ...
    }
    // ...
  }
  return std::make_pair(Key, SubKey);
}
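// Note (illustrative): the (Key, SubKey) pair acts as a two-level clustering
// hash for candidate scalars; Key groups values by coarse kind (opcode,
// type), while SubKey refines the group (e.g. loads by pointer distance,
// extracts by source vector), so compatible values land in the same bucket
// when alternate-opcode bundles are formed.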
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       ArrayRef<Value *> VL) const {
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  // ...
  if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
                           Opcode0, Opcode1, OpcodeMask))
    return true;
  SmallVector<ValueList> Operands;
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    Operands.emplace_back();
    // Prepare the operand vector.
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        Operands.back().push_back(
            PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
        continue;
      }
      Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
    }
  }
  // ...
  for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
    // ...
    switch (Res.value_or(0)) {
    // ...
    }
  }
  // ...
  constexpr unsigned NumAltInsts = 3;
  unsigned NonInstCnt = 0;
  unsigned UndefCnt = 0;
  unsigned ExtraShuffleInsts = 0;
  // ...
  if (any_of(Operands.front(), [&](Value *V) {
        return is_contained(Operands.back(), V);
      }))
    ++ExtraShuffleInsts;
  // ...
  for (Value *V : /* all operand values */) {
    if (isa<Constant, ExtractElementInst>(V) ||
        getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
      if (isa<UndefValue>(V))
        ++UndefCnt;
      continue;
    }
    // ...
    if (!Res.second && Res.first->second == 1)
      ++ExtraShuffleInsts;
    ++Res.first->getSecond();
    if (auto *I = dyn_cast<Instruction>(V))
      UniqueOpcodes.insert(I->getOpcode());
    else if (Res.second)
      ++NonInstCnt;
  }
  return none_of(Uniques, [&](const auto &P) {
           return P.first->hasNUsesOrMore(P.second + 1) &&
                  none_of(P.first->users(), [&](User *U) {
                    return getTreeEntry(U) || Uniques.contains(U);
                  });
         }) ||
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    const InstructionsState &S, ArrayRef<Value *> VL,
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
    SmallVectorImpl<Value *> &PointerOps) {
  assert(S.getMainOp() &&
         "Expected instructions with same/alternate opcodes only.");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  Instruction *VL0 = S.getMainOp();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Too many operands - gather, most probably won't be vectorized.
    if (VL0->getNumOperands() > MaxPHINumOperands)
      return TreeEntry::NeedToGather;
    // Check for terminator values (e.g. invoke).
    for (Value *V : VL) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      for (Value *Incoming : PHI->incoming_values()) {
        Instruction *Term = dyn_cast<Instruction>(Incoming);
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
    // ...
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // Check that we have a buildvector and not a shuffle of two or more
    // different vectors.
    ValueSet SourceVectors;
    for (Value *V : VL) {
      SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
      assert(getElementIndex(V) != std::nullopt &&
             "Non-constant or undef index?");
    }
    if (count_if(VL, [&SourceVectors](Value *V) {
          return !SourceVectors.contains(V);
        }) >= 2) {
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    }
    if (any_of(VL, [&SourceVectors](Value *V) {
          return SourceVectors.contains(V) && !V->hasOneUse();
        })) {
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load.
    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
    case LoadsState::Vectorize:
      return TreeEntry::Vectorize;
    case LoadsState::ScatterVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return TreeEntry::ScatterVectorize;
    case LoadsState::StridedVectorize:
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return TreeEntry::StridedVectorize;
    case LoadsState::Gather:
      // ...
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
      else if (any_of(VL, [](Value *V) {
                 auto *LI = dyn_cast<LoadInst>(V);
                 return !LI || !LI->isSimple();
               }))
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
      else
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
      // ...
      return TreeEntry::NeedToGather;
    }
    llvm_unreachable("Unexpected state of loads");
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
      if (Ty != SrcTy || !isValidElementType(Ty)) {
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
    Type *ComparedTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      auto *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        /* ... */ any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && I->isBinaryOp() && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // We can't combine several GEPs into one vector if they operate on
    // different types.
    Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(V);
      if (!GEP)
        continue;
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
      }
    }
    // We don't combine GEPs with non-constant indexes.
    Type *Ty1 = VL0->getOperand(1)->getType();
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      auto *Op = I->getOperand(1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(SI->getPointerOperand());
    }
    // Check the order of pointer operands.
    if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int> Dist =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      // Check that the sorted pointer operands are consecutive.
      if (static_cast<unsigned>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }
    LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        /* ... */ any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    // ...
    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
        ScalarArgs[J] = CI->getArgOperand(J);
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
          /* ... */
          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments and should be same in order
      // for them to be vectorized.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
          Value *A1J = CI2->getArgOperand(J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          /* ... */) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // REVEC can support non-alternate shuffles.
      if (SLPReVec && getShufflevectorNumGroups(VL))
        return TreeEntry::Vectorize;
      // If this is not an alternate sequence of opcode like add-sub then do
      // not vectorize this instruction.
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    }
    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}
7926 PHIHandler() =
delete;
7928 : DT(DT), Main(Main), Phis(Phis),
7929 Operands(Main->getNumIncomingValues(),
7931 void buildOperands() {
7932 constexpr unsigned FastLimit = 4;
7942 auto *
P = dyn_cast<PHINode>(V);
7944 assert(isa<PoisonValue>(V) &&
7945 "Expected isa instruction or poison value.");
7949 if (
P->getIncomingBlock(
I) == InBB)
7964 Blocks.try_emplace(InBB).first->second.push_back(
I);
7967 if (isa<PoisonValue>(V)) {
7972 auto *
P = cast<PHINode>(V);
7973 for (
unsigned I : seq<unsigned>(0,
P->getNumIncomingValues())) {
7981 auto It =
Blocks.find(InBB);
7987 for (
const auto &
P :
Blocks) {
7988 if (
P.getSecond().size() <= 1)
7990 unsigned BasicI =
P.getSecond().front();
7993 [&](
const auto &Data) {
7994 return !Data.value() ||
7995 Data.value() ==
Operands[BasicI][Data.index()];
7997 "Expected empty operands list.");
8007 const EdgeInfo &UserTreeIdx,
8008 unsigned InterleaveFactor) {
8014 auto TryToFindDuplicates = [&](
const InstructionsState &S,
8015 bool DoNotFail =
false) {
8018 for (
Value *V : VL) {
8025 auto Res = UniquePositions.try_emplace(V, UniqueValues.
size());
8030 size_t NumUniqueScalarValues = UniqueValues.
size();
8033 if (NumUniqueScalarValues == VL.size() &&
8035 ReuseShuffleIndices.
clear();
8038 if ((UserTreeIdx.UserTE &&
8039 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*
TTI)) ||
8041 LLVM_DEBUG(
dbgs() <<
"SLP: Reshuffling scalars not yet supported "
8042 "for nodes with padding.\n");
8043 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
8047 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
8048 (UniquePositions.size() == 1 &&
all_of(UniqueValues, [](
Value *V) {
8051 if (DoNotFail && UniquePositions.size() > 1 &&
8052 NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
8053 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
8056 *
TTI, UniqueValues.
front()->getType(), UniqueValues.
size());
8057 if (PWSz == VL.size()) {
8058 ReuseShuffleIndices.
clear();
8060 NonUniqueValueVL.
assign(UniqueValues.
begin(), UniqueValues.
end());
8062 PWSz - UniqueValues.
size(),
8064 VL = NonUniqueValueVL;
8069 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
8082 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
8084 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
8090 if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
8091 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *S.getMainOp()
8093 if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
8094 auto It = MultiNodeScalars.
find(S.getMainOp());
8095 if (It != MultiNodeScalars.
end()) {
8096 auto *TEIt =
find_if(It->getSecond(),
8097 [&](TreeEntry *ME) { return ME->isSame(VL); });
8098 if (TEIt != It->getSecond().end())
8108 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to partial overlap.\n");
8109 if (TryToFindDuplicates(S))
8110 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
8111 ReuseShuffleIndices);
8115 Nodes.
insert(getTreeEntry(S.getMainOp()));
8116 for (
const TreeEntry *E : MultiNodeScalars.
lookup(S.getMainOp()))
8119 if (
any_of(Nodes, [&](
const TreeEntry *E) {
8121 [&](
Value *V) { return Values.contains(V); }))
8126 all_of(VL, [&](
Value *V) {
return EValues.contains(V); }));
8129 if (TryToFindDuplicates(S))
8130 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
8131 ReuseShuffleIndices);
8138 E->UserTreeIndices.push_back(UserTreeIdx);
8139 LLVM_DEBUG(
dbgs() <<
"SLP: Perfect diamond merge at " << *S.getMainOp()
8150 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
8155 cast<Instruction>(
I)->getOpcode() == S.getOpcode();
8157 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to max recursion depth.\n");
8158 if (TryToFindDuplicates(S))
8159 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
8160 ReuseShuffleIndices);
8165 if (S && S.getOpcode() == Instruction::ExtractElement &&
8166 isa<ScalableVectorType>(
8167 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
8168 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to scalable vector type.\n");
8169 if (TryToFindDuplicates(S))
8170 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
8171 ReuseShuffleIndices);
8178 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
8187 auto &&NotProfitableForVectorization = [&S,
this,
8189 if (!S || !S.isAltShuffle() || VL.size() > 2)
8198 for (
Value *V : VL) {
8199 auto *
I = cast<Instruction>(V);
8201 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
8204 bool IsCommutative =
8206 if ((IsCommutative &&
8207 std::accumulate(InstsCount.
begin(), InstsCount.
end(), 0) < 2) ||
8209 all_of(InstsCount, [](
unsigned ICnt) {
return ICnt < 2; })))
8211 assert(VL.size() == 2 &&
"Expected only 2 alternate op instructions.");
8213 auto *
I1 = cast<Instruction>(VL.front());
8214 auto *I2 = cast<Instruction>(VL.back());
8215 for (
int Op : seq<int>(S.getMainOp()->getNumOperands()))
8217 I2->getOperand(
Op));
8218 if (
static_cast<unsigned>(
count_if(
8219 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
8221 })) >= S.getMainOp()->getNumOperands() / 2)
8223 if (S.getMainOp()->getNumOperands() > 2)
8225 if (IsCommutative) {
8230 I2->getOperand((
Op + 1) % E));
8232 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
8241 bool IsScatterVectorizeUserTE =
8242 UserTreeIdx.UserTE &&
8243 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
8245 bool AreScatterAllGEPSameBlock =
8246 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
8250 auto *
I = dyn_cast<GetElementPtrInst>(V);
8254 BB =
I->getParent();
8255 return BB ==
I->getParent() &&
I->getNumOperands() == 2;
8258 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
8260 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
8263 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
8266 NotProfitableForVectorization(VL)) {
8267 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to C,S,B,O, small shuffle. \n");
8268 if (TryToFindDuplicates(S))
8269 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
8270 ReuseShuffleIndices);
8275 if (S && !EphValues.
empty()) {
8276 for (
Value *V : VL) {
8277 if (EphValues.
count(V)) {
8279 <<
") is ephemeral.\n");
8280 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
8290 for (
Value *V : VL) {
8291 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
8294 if (getTreeEntry(V)) {
8296 <<
") is already in tree.\n");
8297 if (TryToFindDuplicates(S))
8298 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
8299 ReuseShuffleIndices);
8305 if (UserIgnoreList && !UserIgnoreList->empty()) {
8306 for (
Value *V : VL) {
8307 if (UserIgnoreList->contains(V)) {
8308 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to gathered scalar.\n");
8309 if (TryToFindDuplicates(S))
8310 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
8311 ReuseShuffleIndices);
8319 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
8320 assert(VL.front()->getType()->isPointerTy() &&
8321 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
8322 "Expected pointers only.");
8324 const auto *It =
find_if(VL, IsaPred<GetElementPtrInst>);
8325 assert(It != VL.end() &&
"Expected at least one GEP.");
8342 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
8347 if (!TryToFindDuplicates(S,
true))
8353 TreeEntry::EntryState State = getScalarsVectorizationState(
8354 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
8355 if (State == TreeEntry::NeedToGather) {
8356 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
8357 ReuseShuffleIndices);
8361 auto &BSRef = BlocksSchedules[BB];
8363 BSRef = std::make_unique<BlockScheduling>(BB);
8365 BlockScheduling &BS = *BSRef;
8367 std::optional<ScheduleData *> Bundle =
8368 BS.tryScheduleBundle(UniqueValues,
this, S);
8369#ifdef EXPENSIVE_CHECKS
8374 LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
8375 assert((!BS.getScheduleData(VL0) ||
8376 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
8377 "tryScheduleBundle should cancelScheduling on failure");
8378 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
8379 ReuseShuffleIndices);
8380 NonScheduledFirst.insert(VL.front());
8381 if (S.getOpcode() == Instruction::Load &&
8382 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
8386 LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
8388 unsigned ShuffleOrOp =
8389 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
8390 auto CreateOperandNodes = [&](TreeEntry *
TE,
const auto &
Operands) {
8393 for (
unsigned I : seq<unsigned>(
Operands.size())) {
8398 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
8403 for (
unsigned I : PHIOps)
8406 switch (ShuffleOrOp) {
8407 case Instruction::PHI: {
8408 auto *PH = cast<PHINode>(VL0);
8411 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
8416 PHIHandler Handler(*DT, PH, VL);
8417 Handler.buildOperands();
8418 for (
unsigned I : seq<unsigned>(PH->getNumOperands()))
8419 TE->setOperand(
I, Handler.getOperands(
I));
8421 for (
unsigned I : seq<unsigned>(PH->getNumOperands()))
8426 case Instruction::ExtractValue:
8427 case Instruction::ExtractElement: {
8428 if (CurrentOrder.empty()) {
8429 LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
8432 dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence "
8434 for (
unsigned Idx : CurrentOrder)
8442 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8443 ReuseShuffleIndices, CurrentOrder);
8445 "(ExtractValueInst/ExtractElementInst).\n";
8449 TE->setOperand(*
this);
8452 case Instruction::InsertElement: {
8453 assert(ReuseShuffleIndices.
empty() &&
"All inserts should be unique");
8455 auto OrdCompare = [](
const std::pair<int, int> &P1,
8456 const std::pair<int, int> &P2) {
8457 return P1.first > P2.first;
8460 decltype(OrdCompare)>
8461 Indices(OrdCompare);
8462 for (
int I = 0, E = VL.size();
I < E; ++
I) {
8464 Indices.emplace(
Idx,
I);
8466 OrdersType CurrentOrder(VL.size(), VL.size());
8467 bool IsIdentity =
true;
8468 for (
int I = 0, E = VL.size();
I < E; ++
I) {
8469 CurrentOrder[Indices.top().second] =
I;
8470 IsIdentity &= Indices.top().second ==
I;
8474 CurrentOrder.clear();
8475 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8477 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (InsertElementInst).\n";
8480 TE->setOperand(*
this);
8481 buildTree_rec(
TE->getOperand(1),
Depth + 1, {TE, 1});
8484 case Instruction::Load: {
8491 TreeEntry *
TE =
nullptr;
8494 case TreeEntry::Vectorize:
8495 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8496 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
8497 if (CurrentOrder.empty())
8502 <<
"SLP: added a new TreeEntry (jumbled LoadInst).\n";
8505 case TreeEntry::StridedVectorize:
8507 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
8508 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8509 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (strided LoadInst).\n";
8512 case TreeEntry::ScatterVectorize:
8514 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
8515 UserTreeIdx, ReuseShuffleIndices);
8518 <<
"SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
8521 case TreeEntry::CombinedVectorize:
8522 case TreeEntry::NeedToGather:
8525 TE->setOperand(*
this);
8526 if (State == TreeEntry::ScatterVectorize)
8527 buildTree_rec(PointerOps,
Depth + 1, {
TE, 0});
8530 case Instruction::ZExt:
8531 case Instruction::SExt:
8532 case Instruction::FPToUI:
8533 case Instruction::FPToSI:
8534 case Instruction::FPExt:
8535 case Instruction::PtrToInt:
8536 case Instruction::IntToPtr:
8537 case Instruction::SIToFP:
8538 case Instruction::UIToFP:
8539 case Instruction::Trunc:
8540 case Instruction::FPTrunc:
8541 case Instruction::BitCast: {
8542 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8543 std::make_pair(std::numeric_limits<unsigned>::min(),
8544 std::numeric_limits<unsigned>::max()));
8545 if (ShuffleOrOp == Instruction::ZExt ||
8546 ShuffleOrOp == Instruction::SExt) {
8547 CastMaxMinBWSizes = std::make_pair(
8553 }
else if (ShuffleOrOp == Instruction::Trunc) {
8554 CastMaxMinBWSizes = std::make_pair(
8561 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8562 ReuseShuffleIndices);
8566 TE->setOperand(*
this);
8568 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8569 if (ShuffleOrOp == Instruction::Trunc) {
8570 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
8571 }
else if (ShuffleOrOp == Instruction::SIToFP ||
8572 ShuffleOrOp == Instruction::UIToFP) {
8573 unsigned NumSignBits =
8575 if (
auto *OpI = dyn_cast<Instruction>(VL0->
getOperand(0))) {
8577 NumSignBits = std::max(NumSignBits,
Mask.countl_zero());
8579 if (NumSignBits * 2 >=
8581 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
8585 case Instruction::ICmp:
8586 case Instruction::FCmp: {
8589 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8590 ReuseShuffleIndices);
8595 VLOperands Ops(VL, VL0, *
this);
8600 "Commutative Predicate mismatch");
8602 Left = Ops.getVL(0);
8603 Right = Ops.getVL(1);
8606 for (
Value *V : VL) {
8607 if (isa<PoisonValue>(V)) {
8612 auto *
Cmp = cast<CmpInst>(V);
8615 if (
Cmp->getPredicate() != P0)
8617 Left.push_back(LHS);
8618 Right.push_back(RHS);
8625 if (ShuffleOrOp == Instruction::ICmp) {
8626 unsigned NumSignBits0 =
8628 if (NumSignBits0 * 2 >=
8630 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
8631 unsigned NumSignBits1 =
8633 if (NumSignBits1 * 2 >=
8635 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 1)->
Idx);
8639 case Instruction::Select:
8640 case Instruction::FNeg:
8641 case Instruction::Add:
8642 case Instruction::FAdd:
8643 case Instruction::Sub:
8644 case Instruction::FSub:
8645 case Instruction::Mul:
8646 case Instruction::FMul:
8647 case Instruction::UDiv:
8648 case Instruction::SDiv:
8649 case Instruction::FDiv:
8650 case Instruction::URem:
8651 case Instruction::SRem:
8652 case Instruction::FRem:
8653 case Instruction::Shl:
8654 case Instruction::LShr:
8655 case Instruction::AShr:
8656 case Instruction::And:
8657 case Instruction::Or:
8658 case Instruction::Xor:
8659 case Instruction::Freeze: {
8660 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8661 ReuseShuffleIndices);
8663 dbgs() <<
"SLP: added a new TreeEntry "
8664 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
8667 TE->setOperand(*
this, isa<BinaryOperator>(VL0) &&
isCommutative(VL0));
8669 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8672 case Instruction::GetElementPtr: {
8673 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8674 ReuseShuffleIndices);
8675 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (GetElementPtrInst).\n";
8679 for (
Value *V : VL) {
8680 auto *
GEP = dyn_cast<GetElementPtrInst>(V);
8685 Operands.front().push_back(
GEP->getPointerOperand());
8696 [VL0Ty, IndexIdx](
Value *V) {
8697 auto *
GEP = dyn_cast<GetElementPtrInst>(V);
8700 return VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
8704 ->getPointerOperandType()
8707 for (
Value *V : VL) {
8708 auto *
I = dyn_cast<GetElementPtrInst>(V);
8711 ConstantInt::get(Ty, 0,
false));
8714 auto *
Op =
I->getOperand(IndexIdx);
8715 auto *CI = dyn_cast<ConstantInt>(
Op);
8720 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8724 for (
unsigned I = 0, Ops =
Operands.size();
I < Ops; ++
I)
8728 case Instruction::Store: {
8729 bool Consecutive = CurrentOrder.empty();
8732 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8733 ReuseShuffleIndices, CurrentOrder);
8735 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (StoreInst).\n";
8739 dbgs() <<
"SLP: added a new TreeEntry (jumbled StoreInst).\n";
8741 TE->setOperand(*
this);
8742 buildTree_rec(
TE->getOperand(0),
Depth + 1, {TE, 0});
8745 case Instruction::Call: {
8748 CallInst *CI = cast<CallInst>(VL0);
8751 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8752 ReuseShuffleIndices);
8756 for (
unsigned I : seq<unsigned>(CI->
arg_size())) {
8761 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8765 case Instruction::ShuffleVector: {
8766 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8767 ReuseShuffleIndices);
8768 if (S.isAltShuffle()) {
8769 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (isAltShuffle).\n";
8774 dbgs() <<
"SLP: added a new TreeEntry (ShuffleVectorInst).\n";
8779 auto *CI = dyn_cast<CmpInst>(VL0);
8781 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
8783 auto *MainCI = cast<CmpInst>(S.getMainOp());
8784 auto *AltCI = cast<CmpInst>(S.getAltOp());
8788 "Expected different main/alternate predicates.");
8792 for (
Value *V : VL) {
8793 if (isa<PoisonValue>(V)) {
8798 auto *
Cmp = cast<CmpInst>(V);
8809 Left.push_back(LHS);
8810 Right.push_back(RHS);
8819 TE->setOperand(*
this, isa<BinaryOperator>(VL0) || CI);
8821 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8834 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
8837 if (
auto *ST = dyn_cast<StructType>(EltTy)) {
8839 for (
const auto *Ty : ST->elements())
8840 if (Ty != *ST->element_begin())
8842 N *= ST->getNumElements();
8843 EltTy = *ST->element_begin();
8844 }
else if (
auto *AT = dyn_cast<ArrayType>(EltTy)) {
8845 N *= AT->getNumElements();
8846 EltTy = AT->getElementType();
8848 auto *VT = cast<FixedVectorType>(EltTy);
8849 N *= VT->getNumElements();
8850 EltTy = VT->getElementType();
8857 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
8865 bool ResizeAllowed)
const {
8866 const auto *It =
find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
8867 assert(It != VL.
end() &&
"Expected at least one extract instruction.");
8868 auto *E0 = cast<Instruction>(*It);
8870 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
8874 Value *Vec = E0->getOperand(0);
8876 CurrentOrder.
clear();
8880 if (E0->getOpcode() == Instruction::ExtractValue) {
8885 LoadInst *LI = dyn_cast<LoadInst>(Vec);
8889 NElts = cast<FixedVectorType>(Vec->
getType())->getNumElements();
8892 unsigned E = VL.
size();
8893 if (!ResizeAllowed && NElts != E)
8896 unsigned MinIdx = NElts, MaxIdx = 0;
8898 auto *Inst = dyn_cast<Instruction>(V);
8901 if (Inst->getOperand(0) != Vec)
8903 if (
auto *EE = dyn_cast<ExtractElementInst>(Inst))
8904 if (isa<UndefValue>(EE->getIndexOperand()))
8909 const unsigned ExtIdx = *
Idx;
8910 if (ExtIdx >= NElts)
8912 Indices[
I] = ExtIdx;
8913 if (MinIdx > ExtIdx)
8915 if (MaxIdx < ExtIdx)
8918 if (MaxIdx - MinIdx + 1 > E)
8920 if (MaxIdx + 1 <= E)
8924 bool ShouldKeepOrder =
true;
8930 CurrentOrder.
assign(E, E);
8931 for (
unsigned I = 0;
I < E; ++
I) {
8934 const unsigned ExtIdx = Indices[
I] - MinIdx;
8935 if (CurrentOrder[ExtIdx] != E) {
8936 CurrentOrder.
clear();
8939 ShouldKeepOrder &= ExtIdx ==
I;
8940 CurrentOrder[ExtIdx] =
I;
8942 if (ShouldKeepOrder)
8943 CurrentOrder.
clear();
8945 return ShouldKeepOrder;
8948bool BoUpSLP::areAllUsersVectorized(
8950 return (
I->hasOneUse() && (!VectorizedVals || VectorizedVals->
contains(
I))) ||
8952 return ScalarToTreeEntry.contains(U) ||
8953 isVectorLikeInstWithConstOps(U) ||
8954 (isa<ExtractElementInst>(U) && MustGather.contains(U));
8958static std::pair<InstructionCost, InstructionCost>
8966 if (
auto *FPCI = dyn_cast<FPMathOperator>(CI))
8967 FMF = FPCI->getFastMathFlags();
8970 dyn_cast<IntrinsicInst>(CI));
8971 auto IntrinsicCost =
8978 auto LibCost = IntrinsicCost;
8985 return {IntrinsicCost, LibCost};
8988void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
8992 unsigned Sz = Scalars.size();
8995 if (!ReorderIndices.empty())
8997 for (
unsigned I = 0;
I < Sz; ++
I) {
8999 if (!ReorderIndices.empty())
9001 if (isa<PoisonValue>(Scalars[
Idx]))
9003 auto *OpInst = cast<Instruction>(Scalars[
Idx]);
9004 if (IsAltOp(OpInst)) {
9014 if (!ReuseShuffleIndices.
empty()) {
9017 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
9027 if (
auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
9028 auto *AltCI = cast<CmpInst>(AltOp);
9031 assert(MainP != AltP &&
"Expected different main/alternate predicates.");
9032 auto *CI = cast<CmpInst>(
I);
9040 assert((MainP ==
P || AltP ==
P || MainP == SwappedP || AltP == SwappedP) &&
9041 "CmpInst expected to match either main or alternate predicate or "
9044 return MainP !=
P && MainP != SwappedP;
9051 const auto *Op0 = Ops.
front();
9057 const bool IsUniform =
all_of(Ops, [=](
Value *V) {
9061 const bool IsPowerOfTwo =
all_of(Ops, [](
Value *V) {
9063 if (
auto *CI = dyn_cast<ConstantInt>(V))
9064 return CI->getValue().isPowerOf2();
9067 const bool IsNegatedPowerOfTwo =
all_of(Ops, [](
Value *V) {
9069 if (
auto *CI = dyn_cast<ConstantInt>(V))
9070 return CI->getValue().isNegatedPowerOf2();
9075 if (IsConstant && IsUniform)
9077 else if (IsConstant)
9091class BaseShuffleAnalysis {
9093 Type *ScalarTy =
nullptr;
9095 BaseShuffleAnalysis(
Type *ScalarTy) : ScalarTy(ScalarTy) {}
9103 unsigned getVF(
Value *V)
const {
9104 assert(V &&
"V cannot be nullptr");
9105 assert(isa<FixedVectorType>(
V->getType()) &&
9106 "V does not have FixedVectorType");
9107 assert(ScalarTy &&
"ScalarTy cannot be nullptr");
9109 unsigned VNumElements =
9110 cast<FixedVectorType>(
V->getType())->getNumElements();
9111 assert(VNumElements > ScalarTyNumElements &&
9112 "the number of elements of V is not large enough");
9113 assert(VNumElements % ScalarTyNumElements == 0 &&
9114 "the number of elements of V is not a vectorized value");
9115 return VNumElements / ScalarTyNumElements;
9123 int Limit =
Mask.size();
9135 if (Limit % VF == 0 &&
all_of(seq<int>(0, Limit / VF), [=](
int Idx) {
9151 unsigned VF =
Mask.size();
9153 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
9156 int MaskedIdx =
Mask[ExtMask[
I] % VF];
9197 bool SinglePermute) {
9201 while (
auto *SV = dyn_cast<ShuffleVectorInst>(
Op)) {
9203 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
9209 if (isIdentityMask(Mask, SVTy,
false)) {
9210 if (!IdentityOp || !SinglePermute ||
9211 (isIdentityMask(Mask, SVTy,
true) &&
9213 IdentityMask.
size()))) {
9218 IdentityMask.
assign(Mask);
9238 if (SV->isZeroEltSplat()) {
9240 IdentityMask.
assign(Mask);
9242 int LocalVF =
Mask.size();
9244 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
9245 LocalVF = SVOpTy->getNumElements();
9249 static_cast<unsigned>(
I) >= SV->getShuffleMask().size())
9251 ExtMask[
Idx] = SV->getMaskValue(
I);
9261 if (!IsOp1Undef && !IsOp2Undef) {
9263 for (
int &
I : Mask) {
9266 if (SV->getMaskValue(
I % SV->getShuffleMask().size()) ==
9273 combineMasks(LocalVF, ShuffleMask, Mask);
9274 Mask.swap(ShuffleMask);
9276 Op = SV->getOperand(0);
9278 Op = SV->getOperand(1);
9280 if (
auto *OpTy = dyn_cast<FixedVectorType>(
Op->getType());
9281 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
9286 "Expected masks of same sizes.");
9291 Mask.swap(IdentityMask);
9292 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
9293 return SinglePermute &&
9294 (isIdentityMask(Mask, cast<FixedVectorType>(
V->getType()),
9296 (Shuffle &&
Mask.size() == Shuffle->getShuffleMask().size() &&
9297 Shuffle->isZeroEltSplat() &&
9310 template <
typename T,
typename ShuffleBuilderTy>
9312 ShuffleBuilderTy &Builder) {
9313 assert(V1 &&
"Expected at least one vector value.");
9315 Builder.resizeToMatch(V1, V2);
9316 int VF =
Mask.size();
9317 if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
9318 VF = FTy->getNumElements();
9319 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
9326 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
9329 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
9331 CombinedMask1[
I] =
Mask[
I];
9333 CombinedMask2[
I] =
Mask[
I] - VF;
9340 (void)peekThroughShuffles(Op1, CombinedMask1,
false);
9341 (void)peekThroughShuffles(Op2, CombinedMask2,
false);
9344 if (
auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
9345 if (
auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
9350 ExtMask1[
Idx] = SV1->getMaskValue(
I);
9353 cast<FixedVectorType>(SV1->getOperand(1)->getType())
9355 ExtMask1, UseMask::SecondArg);
9360 ExtMask2[
Idx] = SV2->getMaskValue(
I);
9363 cast<FixedVectorType>(SV2->getOperand(1)->getType())
9365 ExtMask2, UseMask::SecondArg);
9366 if (SV1->getOperand(0)->getType() ==
9367 SV2->getOperand(0)->getType() &&
9368 SV1->getOperand(0)->getType() != SV1->getType() &&
9371 Op1 = SV1->getOperand(0);
9372 Op2 = SV2->getOperand(0);
9374 int LocalVF = ShuffleMask1.size();
9375 if (
auto *FTy = dyn_cast<FixedVectorType>(Op1->
getType()))
9376 LocalVF = FTy->getNumElements();
9377 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
9378 CombinedMask1.swap(ShuffleMask1);
9380 LocalVF = ShuffleMask2.size();
9381 if (
auto *FTy = dyn_cast<FixedVectorType>(Op2->
getType()))
9382 LocalVF = FTy->getNumElements();
9383 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
9384 CombinedMask2.swap(ShuffleMask2);
9387 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
9388 Builder.resizeToMatch(Op1, Op2);
9389 VF = std::max(cast<VectorType>(Op1->
getType())
9391 .getKnownMinValue(),
9392 cast<VectorType>(Op2->
getType())
9394 .getKnownMinValue());
9395 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
9398 "Expected undefined mask element");
9399 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
9405 isa<ShuffleVectorInst>(Op1) &&
9406 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
9408 return Builder.createIdentity(Op1);
9409 return Builder.createShuffleVector(
9413 if (isa<PoisonValue>(V1))
9414 return Builder.createPoison(
9415 cast<VectorType>(V1->
getType())->getElementType(),
Mask.size());
9417 bool IsIdentity = peekThroughShuffles(V1, NewMask,
true);
9418 assert(V1 &&
"Expected non-null value after looking through shuffles.");
9421 return Builder.createShuffleVector(V1, NewMask);
9422 return Builder.createIdentity(V1);
9428static std::pair<InstructionCost, InstructionCost>
9439 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
9449 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
9453 for (
Value *V : Ptrs) {
9458 auto *
Ptr = dyn_cast<GetElementPtrInst>(V);
9463 if (!
Ptr || !
Ptr->hasOneUse())
9467 if (PtrsRetainedInVecCode.
size() == Ptrs.size()) {
9473 TTI::PointersChainInfo::getKnownStride(),
9483 [](
const Value *V) {
9484 auto *
Ptr = dyn_cast<GetElementPtrInst>(V);
9485 return Ptr && !
Ptr->hasAllConstantIndices();
9487 ? TTI::PointersChainInfo::getUnknownStride()
9488 : TTI::PointersChainInfo::getKnownStride();
9492 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
9494 auto *It =
find_if(Ptrs, IsaPred<GEPOperator>);
9495 if (It != Ptrs.
end())
9496 BaseGEP = cast<GEPOperator>(*It);
9501 BaseGEP->getPointerOperand(), Indices, VecTy,
9506 return std::make_pair(ScalarCost, VecCost);
9509void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
9510 assert(
TE.isGather() &&
TE.ReorderIndices.empty() &&
9511 "Expected gather node without reordering.");
9517 if (
TE.Scalars.size() == 2 || (
TE.getOpcode() && !
TE.isAltShuffle()) ||
9521 if (
any_of(seq<unsigned>(
TE.Idx), [&](
unsigned Idx) {
9522 return VectorizableTree[Idx]->isSame(TE.Scalars);
9526 auto GenerateLoadsSubkey = [&](
size_t Key,
LoadInst *LI) {
9531 auto LIt = LoadsMap.
find(std::make_pair(Key,
Ptr));
9532 if (LIt != LoadsMap.
end()) {
9533 for (
LoadInst *RLI : LIt->second) {
9539 for (
LoadInst *RLI : LIt->second) {
9546 if (LIt->second.size() > 2) {
9548 hash_value(LIt->second.back()->getPointerOperand());
9554 LoadsMap.
try_emplace(std::make_pair(Key,
Ptr)).first->second.push_back(LI);
9559 bool IsOrdered =
true;
9560 unsigned NumInstructions = 0;
9565 if (
auto *Inst = dyn_cast<Instruction>(V);
9566 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
9572 auto &Container = SortedValues[
Key];
9573 if (IsOrdered && !KeyToIndex.
contains(V) &&
9574 !(isa<Constant, ExtractElementInst>(V) ||
9576 ((Container.contains(
Idx) &&
9577 KeyToIndex.
at(Container[
Idx].back()).back() !=
I - 1) ||
9578 (!Container.empty() && !Container.contains(
Idx) &&
9579 KeyToIndex.
at(Container.back().second.back()).back() !=
I - 1)))
9581 auto &KTI = KeyToIndex[
V];
9583 Container[
Idx].push_back(V);
9588 if (!IsOrdered && NumInstructions > 1) {
9590 TE.ReorderIndices.resize(
TE.Scalars.size(),
TE.Scalars.size());
9591 for (
const auto &
D : SortedValues) {
9592 for (
const auto &
P :
D.second) {
9594 for (
Value *V :
P.second) {
9597 TE.ReorderIndices[Cnt +
K] =
Idx;
9598 TE.Scalars[Cnt +
K] =
V;
9600 Sz += Indices.
size();
9601 Cnt += Indices.
size();
9603 if (Sz > 1 && isa<Instruction>(
P.second.front())) {
9605 *
TTI,
TE.Scalars.front()->getType(), Sz);
9607 for (
unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
9609 }
else if (!
P.second.empty() &&
isConstant(
P.second.front())) {
9610 for (
unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
9617 if (!
TE.ReuseShuffleIndices.empty() ||
TE.ReorderIndices.empty())
9622 auto *ScalarTy =
TE.Scalars.front()->getType();
9624 for (
auto [
Idx, Sz] : SubVectors) {
9628 if (
auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9633 for (
unsigned I : seq<unsigned>(
TE.Scalars.size()))
9634 if (DemandedElts[
I])
9637 CostKind,
I * ScalarTyNumElements, FTy);
9642 int Sz =
TE.Scalars.size();
9644 TE.ReorderIndices.end());
9645 for (
unsigned I : seq<unsigned>(Sz)) {
9647 if (isa<PoisonValue>(V)) {
9650 ReorderMask[
I] =
I +
TE.ReorderIndices.size();
9654 any_of(ReorderMask, [&](
int I) {
return I >= Sz; })
9657 VecTy, ReorderMask);
9660 for (
unsigned I : seq<unsigned>(Sz)) {
9664 if (!isa<PoisonValue>(V))
9667 ReorderMask[
I] =
I + Sz;
9671 VecTy, DemandedElts,
true,
false,
CostKind);
9674 if (
Cost >= BVCost) {
9677 TE.ReorderIndices.clear();
9683 BaseGraphSize = VectorizableTree.size();
9685 class GraphTransformModeRAAI {
9686 bool &SavedIsGraphTransformMode;
9689 GraphTransformModeRAAI(
bool &IsGraphTransformMode)
9690 : SavedIsGraphTransformMode(IsGraphTransformMode) {
9691 IsGraphTransformMode =
true;
9693 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode =
false; }
9694 } TransformContext(IsGraphTransformMode);
9703 const InstructionsState &S) {
9705 for (
unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
9707 I2->getOperand(
Op));
9709 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
9711 [](
const std::pair<Value *, Value *> &
P) {
9712 return isa<Constant>(
P.first) ||
9713 isa<Constant>(
P.second) ||
P.first ==
P.second;
9720 for (
unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9721 TreeEntry &E = *VectorizableTree[
Idx];
9723 reorderGatherNode(E);
9727 for (
unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9728 TreeEntry &E = *VectorizableTree[
Idx];
9735 if (VL.
size() <= 2 || LoadEntriesToVectorize.contains(
Idx) ||
9736 !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
9742 unsigned StartIdx = 0;
9747 *
TTI, VL.
front()->getType(), VF - 1)) {
9748 if (StartIdx + VF >
End)
9751 for (
unsigned Cnt = StartIdx; Cnt + VF <=
End; Cnt += VF) {
9755 if (
const TreeEntry *SE = getTreeEntry(Slice.
front());
9756 SE || getTreeEntry(Slice.
back())) {
9759 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9767 bool IsSplat =
isSplat(Slice);
9768 if (Slices.
empty() || !IsSplat ||
9770 Slice.
front()->getType(), VF)),
9773 Slice.
front()->getType(), 2 * VF)),
9776 static_cast<long>(isa<UndefValue>(Slice.
front()) ? VF - 1
9782 (S.getOpcode() == Instruction::Load &&
9789 if ((!UserIgnoreList || E.Idx != 0) &&
9793 if (isa<PoisonValue>(V))
9795 return areAllUsersVectorized(cast<Instruction>(V),
9799 if (S.getOpcode() == Instruction::Load) {
9811 if (UserIgnoreList && E.Idx == 0)
9816 }
else if (S.getOpcode() == Instruction::ExtractElement ||
9819 !CheckOperandsProfitability(
9822 IsaPred<Instruction>)),
9833 auto AddCombinedNode = [&](
unsigned Idx,
unsigned Cnt,
unsigned Sz) {
9834 E.CombinedEntriesWithIndices.emplace_back(
Idx, Cnt);
9835 if (StartIdx == Cnt)
9836 StartIdx = Cnt + Sz;
9837 if (
End == Cnt + Sz)
9840 for (
auto [Cnt, Sz] : Slices) {
9843 if (TreeEntry *SE = getTreeEntry(Slice.
front());
9844 SE || getTreeEntry(Slice.
back())) {
9847 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9849 SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
9850 AddCombinedNode(SE->Idx, Cnt, Sz);
9853 unsigned PrevSize = VectorizableTree.size();
9854 [[maybe_unused]]
unsigned PrevEntriesSize =
9855 LoadEntriesToVectorize.size();
9856 buildTree_rec(Slice, 0,
EdgeInfo(&E, UINT_MAX));
9857 if (PrevSize + 1 == VectorizableTree.size() &&
9858 VectorizableTree[PrevSize]->isGather() &&
9859 VectorizableTree[PrevSize]->getOpcode() !=
9860 Instruction::ExtractElement &&
9862 if (UserIgnoreList && E.Idx == 0 && VF == 2)
9864 VectorizableTree.pop_back();
9865 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
9866 "LoadEntriesToVectorize expected to remain the same");
9869 AddCombinedNode(PrevSize, Cnt, Sz);
9873 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
9876 E.ReorderIndices.clear();
9879 switch (E.getOpcode()) {
9880 case Instruction::Load: {
9883 if (E.State != TreeEntry::Vectorize)
9885 Type *ScalarTy = E.getMainOp()->getType();
9887 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
9890 if (!E.ReorderIndices.empty() &&
isReverseOrder(E.ReorderIndices) &&
9894 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
9901 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
9902 false, CommonAlignment,
CostKind, BaseLI);
9903 if (StridedCost < OriginalVecCost)
9906 E.State = TreeEntry::StridedVectorize;
9910 case Instruction::Store: {
9912 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
9914 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
9917 if (!E.ReorderIndices.empty() &&
isReverseOrder(E.ReorderIndices) &&
9921 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
9928 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9929 false, CommonAlignment,
CostKind, BaseSI);
9930 if (StridedCost < OriginalVecCost)
9933 E.State = TreeEntry::StridedVectorize;
9934 }
else if (!E.ReorderIndices.empty()) {
9937 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
9938 assert(Mask.size() > 1 &&
"Expected mask greater than 1 element.");
9939 if (Mask.size() < 4)
9941 for (
unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
9945 VecTy, Factor, BaseSI->getAlign(),
9953 unsigned InterleaveFactor = IsInterleaveMask(Mask);
9954 if (InterleaveFactor != 0)
9955 E.setInterleave(InterleaveFactor);
9959 case Instruction::Select: {
9960 if (E.State != TreeEntry::Vectorize)
9966 E.CombinedOp = TreeEntry::MinMax;
9967 TreeEntry *CondEntry =
const_cast<TreeEntry *
>(getOperandEntry(&E, 0));
9968 if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
9969 CondEntry->State == TreeEntry::Vectorize) {
9971 CondEntry->State = TreeEntry::CombinedVectorize;
9980 if (LoadEntriesToVectorize.empty()) {
9982 if (VectorizableTree.size() <= 1 &&
9983 VectorizableTree.front()->getOpcode() == Instruction::Load)
9986 constexpr unsigned SmallTree = 3;
9987 constexpr unsigned SmallVF = 2;
9988 if ((VectorizableTree.size() <= SmallTree &&
9989 VectorizableTree.front()->Scalars.size() == SmallVF) ||
9990 (VectorizableTree.size() <= 2 && UserIgnoreList))
9993 if (VectorizableTree.front()->isNonPowOf2Vec() &&
9997 [](
const std::unique_ptr<TreeEntry> &TE) {
9998 return TE->isGather() &&
9999 TE->getOpcode() == Instruction::Load &&
10011 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
10012 TreeEntry &E = *TE;
10013 if (E.isGather() &&
10014 (E.getOpcode() == Instruction::Load ||
10015 (!E.getOpcode() &&
any_of(E.Scalars,
10017 return isa<LoadInst>(V) &&
10018 !isVectorized(V) &&
10019 !isDeleted(cast<Instruction>(V));
10022 for (
Value *V : E.Scalars) {
10023 auto *LI = dyn_cast<LoadInst>(V);
10029 *
this, V, *DL, *SE, *
TTI,
10030 GatheredLoads[std::make_tuple(
10038 if (!GatheredLoads.
empty())
10039 tryToVectorizeGatheredLoads(GatheredLoads);
10049 bool IsFinalized =
false;
10062 bool SameNodesEstimated =
true;
10071 if (
auto *VTy = dyn_cast<VectorType>(Ty))
10087 const auto *It =
find_if_not(VL, IsaPred<UndefValue>);
10088 assert(It != VL.
end() &&
"Expected at least one non-undef value.");
10091 count(VL, *It) > 1 &&
10093 if (!NeedShuffle) {
10094 if (isa<FixedVectorType>(ScalarTy)) {
10099 cast<FixedVectorType>(ScalarTy));
10102 CostKind, std::distance(VL.
begin(), It),
10108 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
10115 VecTy, ShuffleMask, CostKind,
10119 return GatherCost +
10120 (
all_of(Gathers, IsaPred<UndefValue>)
10122 : R.getGatherCost(Gathers, !Root && VL.
equals(Gathers),
10130 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10131 unsigned NumParts) {
10132 assert(VL.
size() > NumParts &&
"Unexpected scalarized shuffle.");
10134 std::accumulate(VL.
begin(), VL.
end(), 0, [](
unsigned Sz,
Value *V) {
10135 auto *EE = dyn_cast<ExtractElementInst>(V);
10138 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
10141 return std::max(Sz, VecTy->getNumElements());
10147 -> std::optional<TTI::ShuffleKind> {
10148 if (NumElts <= EltsPerVector)
10149 return std::nullopt;
10151 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
10153 if (I == PoisonMaskElem)
10155 return std::min(S, I);
10158 int OffsetReg1 = OffsetReg0;
10162 int FirstRegId = -1;
10163 Indices.assign(1, OffsetReg0);
10167 int Idx =
I - OffsetReg0;
10169 (
Idx / NumElts) * NumParts + (
Idx % NumElts) / EltsPerVector;
10170 if (FirstRegId < 0)
10171 FirstRegId = RegId;
10172 RegIndices.
insert(RegId);
10173 if (RegIndices.
size() > 2)
10174 return std::nullopt;
10175 if (RegIndices.
size() == 2) {
10177 if (Indices.
size() == 1) {
10180 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
10181 [&](
int S,
int I) {
10182 if (I == PoisonMaskElem)
10184 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
10185 ((I - OffsetReg0) % NumElts) / EltsPerVector;
10186 if (RegId == FirstRegId)
10188 return std::min(S, I);
10191 Indices.push_back(OffsetReg1 % NumElts);
10193 Idx =
I - OffsetReg1;
10195 I = (
Idx % NumElts) % EltsPerVector +
10196 (RegId == FirstRegId ? 0 : EltsPerVector);
10198 return ShuffleKind;
10205 for (
unsigned Part : seq<unsigned>(NumParts)) {
10206 if (!ShuffleKinds[Part])
10209 Part * EltsPerVector,
getNumElems(Mask.size(), EltsPerVector, Part));
10213 std::optional<TTI::ShuffleKind> RegShuffleKind =
10214 CheckPerRegistersShuffle(SubMask, Indices);
10215 if (!RegShuffleKind) {
10218 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.
size())))
10231 *R.TTI, VL.
front()->getType(),
alignTo(NumElts, EltsPerVector));
10232 for (
unsigned Idx : Indices) {
10233 assert((
Idx + EltsPerVector) <= BaseVF &&
10234 "SK_ExtractSubvector index out of range");
10245 if (OriginalCost <
Cost)
10246 Cost = OriginalCost;
10254 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10261 void estimateNodesPermuteCost(
const TreeEntry &E1,
const TreeEntry *E2,
10263 unsigned SliceSize) {
10264 if (SameNodesEstimated) {
10270 if ((InVectors.
size() == 2 &&
10271 cast<const TreeEntry *>(InVectors.
front()) == &E1 &&
10272 cast<const TreeEntry *>(InVectors.
back()) == E2) ||
10273 (!E2 && cast<const TreeEntry *>(InVectors.
front()) == &E1)) {
10274 unsigned Limit =
getNumElems(Mask.size(), SliceSize, Part);
10277 "Expected all poisoned elements.");
10279 copy(SubMask, std::next(CommonMask.
begin(), SliceSize * Part));
10284 Cost += createShuffle(InVectors.
front(),
10285 InVectors.
size() == 1 ?
nullptr : InVectors.
back(),
10287 transformMaskAfterShuffle(CommonMask, CommonMask);
10288 }
else if (InVectors.
size() == 2) {
10289 Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
10290 transformMaskAfterShuffle(CommonMask, CommonMask);
10292 SameNodesEstimated =
false;
10293 if (!E2 && InVectors.
size() == 1) {
10294 unsigned VF = E1.getVectorFactor();
10297 cast<FixedVectorType>(V1->
getType())->getNumElements());
10299 const auto *E = cast<const TreeEntry *>(InVectors.
front());
10300 VF = std::max(VF, E->getVectorFactor());
10302 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10304 CommonMask[
Idx] = Mask[
Idx] + VF;
10305 Cost += createShuffle(InVectors.
front(), &E1, CommonMask);
10306 transformMaskAfterShuffle(CommonMask, CommonMask);
10308 auto P = InVectors.
front();
10309 Cost += createShuffle(&E1, E2, Mask);
10310 unsigned VF = Mask.size();
10315 const auto *E = cast<const TreeEntry *>(
P);
10316 VF = std::max(VF, E->getVectorFactor());
10318 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10320 CommonMask[
Idx] =
Idx + (InVectors.
empty() ? 0 : VF);
10321 Cost += createShuffle(
P, InVectors.
front(), CommonMask);
10322 transformMaskAfterShuffle(CommonMask, CommonMask);
10326 class ShuffleCostBuilder {
10329 static bool isEmptyOrIdentity(
ArrayRef<int> Mask,
unsigned VF) {
10331 return Mask.empty() ||
10332 (VF == Mask.size() &&
10340 ~ShuffleCostBuilder() =
default;
10345 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
10346 if (isEmptyOrIdentity(Mask, VF))
10349 cast<VectorType>(V1->
getType()), Mask);
10354 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
10355 if (isEmptyOrIdentity(Mask, VF))
10358 cast<VectorType>(V1->
getType()), Mask);
10364 void resizeToMatch(
Value *&,
Value *&)
const {}
10374 ShuffleCostBuilder Builder(
TTI);
10377 unsigned CommonVF = Mask.size();
10379 auto GetNodeMinBWAffectedCost = [&](
const TreeEntry &E,
10383 Type *EScalarTy = E.Scalars.front()->getType();
10384 bool IsSigned =
true;
10385 if (
auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
10387 IsSigned = It->second.second;
10389 if (EScalarTy != ScalarTy) {
10390 unsigned CastOpcode = Instruction::Trunc;
10391 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10392 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10394 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10402 if (isa<Constant>(V))
10404 auto *VecTy = cast<VectorType>(V->getType());
10406 if (EScalarTy != ScalarTy) {
10408 unsigned CastOpcode = Instruction::Trunc;
10409 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10410 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10412 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10419 if (!V1 && !V2 && !P2.
isNull()) {
10421 const TreeEntry *E = cast<const TreeEntry *>(P1);
10422 unsigned VF = E->getVectorFactor();
10423 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10424 CommonVF = std::max(VF, E2->getVectorFactor());
10427 return Idx < 2 * static_cast<int>(CommonVF);
10429 "All elements in mask must be less than 2 * CommonVF.");
10430 if (E->Scalars.size() == E2->Scalars.size()) {
10434 for (
int &
Idx : CommonMask) {
10437 if (
Idx <
static_cast<int>(CommonVF) && !EMask.
empty())
10439 else if (
Idx >=
static_cast<int>(CommonVF))
10440 Idx = (E2Mask.
empty() ?
Idx - CommonVF : E2Mask[
Idx - CommonVF]) +
10444 CommonVF = E->Scalars.size();
10445 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
10446 GetNodeMinBWAffectedCost(*E2, CommonVF);
10448 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
10449 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
10452 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
10453 }
else if (!V1 && P2.
isNull()) {
10455 const TreeEntry *E = cast<const TreeEntry *>(P1);
10456 unsigned VF = E->getVectorFactor();
10460 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
10461 "All elements in mask must be less than CommonVF.");
10462 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
10464 assert(!EMask.
empty() &&
"Expected non-empty common mask.");
10465 for (
int &
Idx : CommonMask) {
10469 CommonVF = E->Scalars.size();
10470 }
else if (
unsigned Factor = E->getInterleaveFactor();
10471 Factor > 0 && E->Scalars.size() != Mask.size() &&
10475 std::iota(CommonMask.
begin(), CommonMask.
end(), 0);
10477 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
10480 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
10481 CommonVF == CommonMask.
size() &&
10483 [](
const auto &&
P) {
10485 static_cast<unsigned>(
P.value()) !=
P.index();
10493 }
else if (V1 && P2.
isNull()) {
10495 ExtraCost += GetValueMinBWAffectedCost(V1);
10496 CommonVF = getVF(V1);
10499 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
10500 "All elements in mask must be less than CommonVF.");
10501 }
else if (V1 && !V2) {
10503 unsigned VF = getVF(V1);
10504 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10505 CommonVF = std::max(VF, E2->getVectorFactor());
10508 return Idx < 2 * static_cast<int>(CommonVF);
10510 "All elements in mask must be less than 2 * CommonVF.");
10511 if (E2->Scalars.size() == VF && VF != CommonVF) {
10513 assert(!E2Mask.
empty() &&
"Expected non-empty common mask.");
10514 for (
int &
Idx : CommonMask) {
10517 if (
Idx >=
static_cast<int>(CommonVF))
10518 Idx = E2Mask[
Idx - CommonVF] + VF;
10522 ExtraCost += GetValueMinBWAffectedCost(V1);
10524 ExtraCost += GetNodeMinBWAffectedCost(
10525 *E2, std::min(CommonVF, E2->getVectorFactor()));
10526 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
10527 }
else if (!V1 && V2) {
10529 unsigned VF = getVF(V2);
10530 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
10531 CommonVF = std::max(VF, E1->getVectorFactor());
10534 return Idx < 2 * static_cast<int>(CommonVF);
10536 "All elements in mask must be less than 2 * CommonVF.");
10537 if (E1->Scalars.size() == VF && VF != CommonVF) {
10539 assert(!E1Mask.
empty() &&
"Expected non-empty common mask.");
10540 for (
int &
Idx : CommonMask) {
10543 if (
Idx >=
static_cast<int>(CommonVF))
10544 Idx = E1Mask[
Idx - CommonVF] + VF;
10550 ExtraCost += GetNodeMinBWAffectedCost(
10551 *E1, std::min(CommonVF, E1->getVectorFactor()));
10553 ExtraCost += GetValueMinBWAffectedCost(V2);
10554 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
10556 assert(V1 && V2 &&
"Expected both vectors.");
10557 unsigned VF = getVF(V1);
10558 CommonVF = std::max(VF, getVF(V2));
10561 return Idx < 2 * static_cast<int>(CommonVF);
10563 "All elements in mask must be less than 2 * CommonVF.");
10565 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
10566 if (V1->
getType() != V2->getType()) {
10568 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
10570 if (cast<VectorType>(V1->
getType())->getElementType() != ScalarTy)
10572 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
10573 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
10576 if (
auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
10581 InVectors.
front() =
10583 if (InVectors.
size() == 2)
10585 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
10586 V1, V2, CommonMask, Builder);
10593 : BaseShuffleAnalysis(ScalarTy),
TTI(
TTI),
10594 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
10595 CheckedExtracts(CheckedExtracts) {}
10597 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10598 unsigned NumParts,
bool &UseVecBaseAsInput) {
10599 UseVecBaseAsInput =
false;
10602 Value *VecBase =
nullptr;
10604 if (!E->ReorderIndices.empty()) {
10606 E->ReorderIndices.end());
10611 bool PrevNodeFound =
any_of(
10613 [&](
const std::unique_ptr<TreeEntry> &TE) {
10614 return ((!TE->isAltShuffle() &&
10615 TE->getOpcode() == Instruction::ExtractElement) ||
10617 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10618 return VL.size() > Data.index() &&
10619 (Mask[Data.index()] == PoisonMaskElem ||
10620 isa<UndefValue>(VL[Data.index()]) ||
10621 Data.value() == VL[Data.index()]);
10626 for (
unsigned Part : seq<unsigned>(NumParts)) {
10628 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10632 if (isa<UndefValue>(V) ||
10641 auto *EE = cast<ExtractElementInst>(V);
10642 VecBase = EE->getVectorOperand();
10643 UniqueBases.
insert(VecBase);
10644 const TreeEntry *VE = R.getTreeEntry(V);
10645 if (!CheckedExtracts.
insert(V).second ||
10646 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10649 return isa<GetElementPtrInst>(U) &&
10650 !R.areAllUsersVectorized(cast<Instruction>(U),
10658 unsigned Idx = *EEIdx;
10660 if (EE->hasOneUse() || !PrevNodeFound) {
10662 if (isa<SExtInst, ZExtInst>(Ext) &&
10663 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10668 EE->getVectorOperandType(),
Idx);
10671 Ext->getOpcode(), Ext->getType(), EE->getType(),
10686 if (!PrevNodeFound)
10687 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
10690 transformMaskAfterShuffle(CommonMask, CommonMask);
10691 SameNodesEstimated =
false;
10692 if (NumParts != 1 && UniqueBases.
size() != 1) {
10693 UseVecBaseAsInput =
true;
10701 std::optional<InstructionCost>
10705 return std::nullopt;
10711 return Idx < static_cast<int>(E1.getVectorFactor());
10713 "Expected single vector shuffle mask.");
10717 if (InVectors.
empty()) {
10718 CommonMask.
assign(Mask.begin(), Mask.end());
10719 InVectors.
assign({&E1, &E2});
10722 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
10725 if (NumParts == 0 || NumParts >= Mask.size() ||
10726 MaskVecTy->getNumElements() % NumParts != 0 ||
10728 MaskVecTy->getNumElements() / NumParts))
10733 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10734 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
10737 if (InVectors.
empty()) {
10738 CommonMask.
assign(Mask.begin(), Mask.end());
10739 InVectors.
assign(1, &E1);
10742 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
10745 if (NumParts == 0 || NumParts >= Mask.size() ||
10746 MaskVecTy->getNumElements() % NumParts != 0 ||
10748 MaskVecTy->getNumElements() / NumParts))
10753 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10754 estimateNodesPermuteCost(E1,
nullptr, Mask, Part, SliceSize);
10755 if (!SameNodesEstimated && InVectors.
size() == 1)
10767 auto *EI = cast<ExtractElementInst>(
10768 cast<const TreeEntry *>(InVectors.
front())
10769 ->getOrdered(
P.index()));
10770 return EI->getVectorOperand() == V1 ||
10771 EI->getVectorOperand() == V2;
10773 "Expected extractelement vectors.");
10777 if (InVectors.
empty()) {
10779 "Expected empty input mask/vectors.");
10780 CommonMask.
assign(Mask.begin(), Mask.end());
10781 InVectors.
assign(1, V1);
10786 assert(InVectors.
size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
10787 !CommonMask.
empty() &&
10790 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
10791 ->getOrdered(
P.index());
10793 return P.value() == Mask[
P.index()] ||
10794 isa<UndefValue>(Scalar);
10795 if (isa<Constant>(V1))
10797 auto *EI = cast<ExtractElementInst>(Scalar);
10798 return EI->getVectorOperand() == V1;
10800 "Expected only tree entry for extractelement vectors.");
10804 "Expected only tree entries from extracts/reused buildvectors.");
10805 unsigned VF = getVF(V1);
10806 if (InVectors.
size() == 2) {
10807 Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
10808 transformMaskAfterShuffle(CommonMask, CommonMask);
10809 VF = std::max<unsigned>(VF, CommonMask.
size());
10810 }
else if (
const auto *InTE =
10811 InVectors.
front().dyn_cast<
const TreeEntry *>()) {
10812 VF = std::max(VF, InTE->getVectorFactor());
10815 VF, cast<FixedVectorType>(cast<Value *>(InVectors.
front())->getType())
10816 ->getNumElements());
10819 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10821 CommonMask[
Idx] = Mask[
Idx] + VF;
10824 Value *Root =
nullptr) {
10825 Cost += getBuildVectorCost(VL, Root);
10829 unsigned VF = VL.
size();
10831 VF = std::min(VF, MaskVF);
10833 if (isa<UndefValue>(V)) {
10839 if (
auto *VecTy = dyn_cast<FixedVectorType>(Vals.
front()->getType())) {
10846 Type *ScalarTy = V->getType()->getScalarType();
10848 if (isa<PoisonValue>(V))
10850 else if (isa<UndefValue>(V))
10854 std::fill_n(NewVals.
begin() +
I * VecTyNumElements, VecTyNumElements,
10857 Vals.
swap(NewVals);
10863 cast<FixedVectorType>(Root->
getType())->getNumElements()),
10870 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10873 IsFinalized =
true;
10876 if (InVectors.
size() == 2)
10877 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
10879 Cost += createShuffle(Vec,
nullptr, CommonMask);
10880 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10884 "Expected vector length for the final value before action.");
10885 Value *V = cast<Value *>(Vec);
10886 Action(V, CommonMask);
10887 InVectors.
front() = V;
10889 if (!SubVectors.empty()) {
10891 if (InVectors.
size() == 2)
10892 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
10894 Cost += createShuffle(Vec,
nullptr, CommonMask);
10895 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10899 if (!SubVectorsMask.
empty()) {
10901 "Expected same size of masks for subvectors and common mask.");
10903 copy(SubVectorsMask, SVMask.begin());
10904 for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
10907 I1 = I2 + CommonMask.
size();
10914 for (
auto [E,
Idx] : SubVectors) {
10915 Type *EScalarTy = E->Scalars.front()->getType();
10916 bool IsSigned =
true;
10917 if (
auto It =
R.MinBWs.find(E); It !=
R.MinBWs.end()) {
10920 IsSigned = It->second.second;
10922 if (ScalarTy != EScalarTy) {
10923 unsigned CastOpcode = Instruction::Trunc;
10924 unsigned DstSz =
R.DL->getTypeSizeInBits(ScalarTy);
10925 unsigned SrcSz =
R.DL->getTypeSizeInBits(EScalarTy);
10927 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10937 if (!CommonMask.
empty()) {
10938 std::iota(std::next(CommonMask.
begin(),
Idx),
10939 std::next(CommonMask.
begin(),
Idx + E->getVectorFactor()),
10946 if (CommonMask.
empty()) {
10947 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
10951 createShuffle(InVectors.
front(),
10952 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
10958 "Shuffle construction must be finalized.");
10962const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(
const TreeEntry *E,
10963 unsigned Idx)
const {
10964 if (
const TreeEntry *VE = getMatchedVectorizedOperand(E,
Idx))
10967 find_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
10968 return TE->isGather() &&
10969 find_if(TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
10970 return EI.EdgeIdx == Idx && EI.UserTE == E;
10971 }) != TE->UserTreeIndices.end();
10973 assert(It != VectorizableTree.end() &&
"Expected vectorizable entry.");
10978 if (TE.State == TreeEntry::ScatterVectorize ||
10979 TE.State == TreeEntry::StridedVectorize)
10981 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
10982 !TE.isAltShuffle()) {
10983 if (TE.ReorderIndices.empty())
10997 const unsigned VF,
unsigned MinBW,
11029 auto It = MinBWs.
find(E);
11030 Type *OrigScalarTy = ScalarTy;
11031 if (It != MinBWs.
end()) {
11032 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
11038 unsigned EntryVF = E->getVectorFactor();
11041 if (E->isGather()) {
11044 if (isa<InsertElementInst>(VL[0]))
11046 if (isa<CmpInst>(VL.
front()))
11047 ScalarTy = VL.
front()->getType();
11048 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11049 E, ScalarTy, *
TTI, VectorizedVals, *
this, CheckedExtracts);
11053 if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
11056 if (E->getOpcode() == Instruction::Store) {
11058 NewMask.
resize(E->ReorderIndices.size());
11059 copy(E->ReorderIndices, NewMask.
begin());
11065 if (!E->ReuseShuffleIndices.empty())
11066 ::addMask(Mask, E->ReuseShuffleIndices);
11070 assert((E->State == TreeEntry::Vectorize ||
11071 E->State == TreeEntry::ScatterVectorize ||
11072 E->State == TreeEntry::StridedVectorize) &&
11073 "Unhandled state");
11074 assert(E->getOpcode() &&
11076 (E->getOpcode() == Instruction::GetElementPtr &&
11077 E->getMainOp()->getType()->isPointerTy())) &&
11080 unsigned ShuffleOrOp =
11081 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
11082 if (E->CombinedOp != TreeEntry::NotCombinedOp)
11083 ShuffleOrOp = E->CombinedOp;
11085 const unsigned Sz = UniqueValues.
size();
  const unsigned Sz = UniqueValues.size();
  SmallBitVector UsedScalars(Sz, false);
  for (unsigned I = 0; I < Sz; ++I) {
    if (isa<Instruction>(UniqueValues[I]) && getTreeEntry(UniqueValues[I]) == E)
      continue;
    UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (const TreeEntry *OpTE = getTreeEntry(V))
      return getCastContextHint(*OpTE);
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
      return TTI::CastContextHint::GatherScatter;
    return TTI::CastContextHint::None;
  };
  auto GetCostDiff =
      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
          function_ref<InstructionCost(InstructionCost)> GetVectorCost) {
        InstructionCost ScalarCost = 0;
        if (isa<CastInst, CallInst>(VL0)) {
          // ...
          ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
        } else {
          for (unsigned I = 0; I < Sz; ++I) {
            if (UsedScalars.test(I))
              continue;
            ScalarCost += ScalarEltCost(I);
          }
        }
        // ...
        if (/* ... */
            (E->getOpcode() != Instruction::Load ||
             !E->UserTreeIndices.empty())) {
          const EdgeInfo &EI =
              *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
                return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
              });
          if (EI.UserTE->getOpcode() != Instruction::Select ||
              /* ... */) {
            auto UserBWIt = MinBWs.find(EI.UserTE);
            Type *UserScalarTy =
                EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
            if (UserBWIt != MinBWs.end())
              UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                              UserBWIt->second.first);
            if (ScalarTy != UserScalarTy) {
              unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
              unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
              unsigned VecOpcode;
              auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
              if (BWSz > SrcBWSz)
                VecOpcode = Instruction::Trunc;
              else
                VecOpcode =
                    It->second.second ? Instruction::SExt : Instruction::ZExt;
              // ...
            }
          }
        }
        LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
                                 ScalarCost, "Calculated costs for Tree"));
        return VecCost - ScalarCost;
      };
  auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
    // ...
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize) &&
           "Entry state expected to be Vectorize or StridedVectorize here.");
    // ...
    std::tie(ScalarCost, VecCost) = getGEPCosts(
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
  };

  auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
    // ...
    Type *CanonicalType = Ty;
    // ...
    IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
                                      {CanonicalType, CanonicalType});
    InstructionCost IntrinsicCost =
        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
    if (VI && SelectOnly) {
      assert(!Ty->isVectorTy() && "Expected only for scalar type.");
      auto *CI = cast<CmpInst>(VI->getOperand(0));
      IntrinsicCost -= TTI->getCmpSelInstrCost(
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
  };
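// Every case below follows the same pattern: GetScalarCost prices one unique
// scalar lane, GetVectorCost prices the whole vector (folding in CommonCost,
// the cost of any reorder/reuse shuffles computed above), and GetCostDiff
// returns the vector cost minus the summed scalar costs.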
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // ...
    InstructionCost ScalarCost = 0;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      // ...
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
        // ...
        if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
          if (!OpTE->ReuseShuffleIndices.empty())
            ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                            OpTE->Scalars.size());
      }
    }
    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      // ...
      auto *I = cast<Instruction>(UniqueValues[Idx]);
      VectorType *SrcVecTy;
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        unsigned NumElts;
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
        // ...
      }
      // ...
      if (I->hasOneUse()) {
        // ...
        if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
            all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
          // ...
              Ext->getOpcode(), Ext->getType(), I->getType(),
          // ...
        }
      }
      // ...
    };
    auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
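// InsertElement chains are costed as a build-vector: the code below finds the
// contiguous lane range [OffsetBeg, OffsetEnd] actually written, widens it to
// a register-friendly size, and then prices the final insert-subvector or
// blending shuffle against the demanded elements.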
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();
    // ...
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    // ...
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    // ...
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // ...
    if (OffsetBeg + InsertVecSz > VecSz) {
      // ...
      InsertVecSz = VecSz;
    }
    // ...
    if (!E->ReorderIndices.empty()) {
      // ...
    } else {
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      // ...
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");
    // ...
        InsertVecTy, Mask);
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    // ...
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        // ...
      }
      // ...
      for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
        // ...
    }
    // ...
  }
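// For casts the vector opcode may be rewritten based on MinBWs: if source and
// destination were demoted to equal widths the cast degenerates to a free
// bitcast, a narrowing becomes a trunc, and a widening becomes sext/zext
// chosen by the recorded signedness of the demoted value.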
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    // ...
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (/* ... */ (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // ...
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        // ...
      }
      // ...
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      // ...
      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(*UserIgnoreList, [](Value *V) {
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 Instruction::Xor},
                                I->getOpcode());
          });
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
                                   VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
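// Compares and selects are priced with getCmpSelInstrCost on both the scalar
// and vector sides; a recognized min/max pattern may instead be priced as an
// intrinsic, and a select whose condition vector is narrower than the value
// vector is charged an extra condition-widening shuffle.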
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    // ...
    if (match(VL0, MatchCmp))
      // ...
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      // ...
      if ((/* ... */ !match(VI, MatchCmp)) ||
          /* ... */)
        // ...
      InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
      // ...
        ScalarCost = IntrinsicCost;
      // ...
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      // ...
      InstructionCost VecCost = TTI->getCmpSelInstrCost(
          E->getOpcode(), VecTy, MaskTy, VecPred, CostKind,
          getOperandInfo(E->getOperand(0)),
          getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        auto *CondType =
            getWidenedType(SI->getCondition()->getType(), VL.size());
        unsigned CondNumElements = CondType->getNumElements();
        unsigned VecTyNumElements = getNumElements(VecTy);
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // ...
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      // ...
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      // ...
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          // ...
            auto *CI = dyn_cast<ConstantInt>(Op);
            return CI && CI->getValue().countr_one() >= It->second.first;
          // ...
        }
      }
      // ...
      unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
      // ...
      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
                                         Op2Info, {}, nullptr, TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
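// Loads are priced per entry state: consecutive (optionally interleaved)
// loads use the plain or interleaved memory-op cost, strided loads the
// strided cost, and scatter-vectorized loads the gather/scatter cost; the
// saving on pointer GEPs is folded in via GetGEPCostDiff.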
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, TTI::OperandValueInfo(), VI);
    };
    auto *LI0 = cast<LoadInst>(VL0);
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Instruction::Load, VecTy, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind,
              TTI::OperandValueInfo());
        }
        break;
      case TreeEntry::StridedVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        VecLdCost = TTI->getGatherScatterOpCost(
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected vectorization state.");
      }
      return VecLdCost + CommonCost;
    };
    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;
    // ...
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL))
      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
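// Stores mirror the load logic: strided stores use the strided memory-op
// cost, interleaved store groups the interleaved cost, and consecutive
// stores the plain memory-op cost, again plus the GEP cost delta for the
// pointer operands.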
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
      // ...
      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
                                  VI->getAlign(), VI->getPointerAddressSpace(),
                                  CostKind, OpInfo, VI);
    };
    auto *BaseSI =
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // ...
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
        VecStCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
                 "No reused shuffles expected");
          // ...
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Instruction::Store, VecTy, Factor, std::nullopt,
              BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          // ...
          VecStCost = TTI->getMemoryOpCost(
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(VL)) {
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    }
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
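// Calls are priced as the cheaper of a vector intrinsic and a vectorized
// library call, with argument types possibly narrowed per MinBWs.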
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
      // ...
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(VL0);
      // ...
          It != MinBWs.end() ? It->second.first : 0, TTI);
      // ...
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
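// Alternate-opcode nodes (and, with SLPReVec, plain shufflevectors) are
// priced as both opcodes over the full vector plus the blending shuffle; a
// "diamond match" with an existing node makes the vector ops free, and a
// target may report a cheaper single legal alt-instruction instead.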
  case Instruction::ShuffleVector: {
    if (!SLPReVec || E->isAltShuffle())
      assert(E->isAltShuffle() &&
             // ...
             (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");
    // ...
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        // ...
        if (TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
          return true;
      }
      return false;
    };
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      // ...
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
      // ...
    };
    // ...
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      // ...
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
          // ...
        });
        // ...
      } else if (/* ... */) {
        VecCost =
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        VecCost +=
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        // ...
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            VL0);
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
            E->getAltOp());
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        // ...
        auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcBWSz =
            DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
        if (SrcIt != MinBWs.end()) {
          SrcBWSz = SrcIt->second.first;
          // ...
        }
        // ...
        if (BWSz <= SrcBWSz) {
          if (BWSz < SrcBWSz)
            VecCost =
                TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                                        TTI::CastContextHint::None, CostKind);
          LLVM_DEBUG({
            dbgs() << "SLP: alternate extension, which should be truncated.\n";
            // ...
          });
          return VecCost;
        }
        // ...
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
                                          TTI::CastContextHint::None, CostKind);
        VecCost +=
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
                                    TTI::CastContextHint::None, CostKind);
      }
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [&](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            // ...
          },
          Mask);
      // ...
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      // ...
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        InstructionCost AltVecCost = TTIRef.getAltInstrCost(
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
      // ...
      return VecCost;
    };
    if (SLPReVec && !E->isAltShuffle())
      return GetCostDiff(
          GetScalarCost, [&](InstructionCost) -> InstructionCost {
            // ...
            assert(/* ... */ "Not supported shufflevector usage.");
            auto *SV = cast<ShuffleVectorInst>(VL.front());
            unsigned SVNumElements =
                cast<FixedVectorType>(SV->getOperand(0)->getType())
                    ->getNumElements();
            unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
            for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
              // ...
              assert(isa<ShuffleVectorInst>(V) &&
                     "Not supported shufflevector usage.");
              auto *SV = cast<ShuffleVectorInst>(V);
              int Index;
              [[maybe_unused]] bool IsExtractSubvectorMask =
                  SV->isExtractSubvectorMask(Index);
              assert(IsExtractSubvectorMask &&
                     "Not supported shufflevector usage.");
              if (NextIndex != Index)
                return InstructionCost::getInvalid();
              NextIndex += SV->getShuffleMask().size();
            }
            return ::getShuffleCost(
                /* ... */);
          });
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
    return CommonCost;
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable.\n");

  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    return TE->isGather() &&
           !any_of(TE->Scalars,
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (/* ... */ TE->Scalars.size() < Limit ||
            ((TE->getOpcode() == Instruction::ExtractElement ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             /* ... */) ||
            (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));
  };

  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       (/* ... */
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;
  // ...
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;
  // ...
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize))
    return false;

  return true;
}
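// Load-combine bailout: if the root of a reduction (or each scalar of a store
// bundle) is a zext-load feeding an or/shl-by-multiple-of-8 chain, the
// backend can likely combine the scalar loads into one wide load, so SLP
// vectorization is suppressed. The matcher below implements that check.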
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {
  // ...
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
           ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
      FoundOr = true;
  }
  Value *Load;
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
    return false;
  // ...
  Type *SrcTy = Load->getType();
  // ...
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");
  return true;
}
bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
  // ...
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
                                    /*MustMatchOrInst=*/false);
}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
  // ...
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
    // ...
  }
  return true;
}
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
    return true;
  }
  // ...
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
         /* ... */)))
    return true;
  // ...
  constexpr int Limit = 4;
  if (/* ... */
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                TE->getOpcode() != Instruction::ExtractElement &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;
  // ...
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;
  // ...
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       /* ... */);
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, UndefValue>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));
               });
      }))
    return true;
  // ...
  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      /* ... */
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      /* ... */
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
      /* ... */)
    return true;
  // ...
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      // ...
      [](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() &&
               TE->getOpcode() == Instruction::Load &&
               // ...
      })
    return true;
  // ...
  for (/* ... */) {
    TreeEntry &E = *VectorizableTree[Idx];
    // ...
    if (E.getOpcode() && E.getOpcode() != Instruction::Load)
      return false;
    // ...
  }
  // ...

// ...
  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
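// The loop below is from BoUpSLP::getSpillCost: it walks the vectorized
// instructions in dominator-tree DFS order and, whenever a (non-free) call is
// crossed while tree values are live, charges the target's cost of spilling
// and refilling those live vector values around the call.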
  for (const auto &TEPtr : VectorizableTree) {
    if (TEPtr->State != TreeEntry::Vectorize)
      continue;
    Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
    // ...
  }
  // ...
    auto *NodeA = DT->getNode(A->getParent());
    auto *NodeB = DT->getNode(B->getParent());
    assert(NodeA && "Should only process reachable instructions");
    assert(NodeB && "Should only process reachable instructions");
    assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    if (NodeA != NodeB)
      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
    return B->comesBefore(A);
  // ...
    LiveValues.erase(PrevInst);
    for (auto &J : PrevInst->operands()) {
      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
        LiveValues.insert(cast<Instruction>(&*J));
    }
    LLVM_DEBUG({
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
      // ...
    });
    // ...
    unsigned NumCalls = 0;
    // ...
    while (InstIt != PrevInstIt) {
      if (PrevInstIt == PrevInst->getParent()->rend()) {
        PrevInstIt = Inst->getParent()->rbegin();
        continue;
      }
      auto NoCallIntrinsic = [this](Instruction *I) {
        if (auto *II = dyn_cast<IntrinsicInst>(I)) {
          if (II->isAssumeLikeIntrinsic())
            return true;
          FastMathFlags FMF;
          SmallVector<Type *, 4> Tys;
          for (auto &ArgOp : II->args())
            Tys.push_back(ArgOp->getType());
          if (auto *FPMO = dyn_cast<FPMathOperator>(II))
            FMF = FPMO->getFastMathFlags();
          // ...
          if (IntrCost < CallCost)
            return true;
        }
        return false;
      };
      // ...
      if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
          &*PrevInstIt != PrevInst)
        NumCalls++;
      ++PrevInstIt;
    }
    if (NumCalls) {
      // ...
      for (auto *II : LiveValues) {
        auto *ScalarTy = II->getType();
        if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
          ScalarTy = VectorTy->getElementType();
        // ...
      }
      // ...
    }
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  // ...
  do {
    // ...
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        /* ... */)
      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        /* ... */)
      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
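// performExtractsShuffleAction folds a list of (vector, mask) pairs into a
// chain of two-input shuffles over a possibly non-undef base vector, resizing
// mismatched inputs via the ResizeAction callback and materializing each
// combined step via the Action callback.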
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
};
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, false);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      // ...
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    auto *V = ValueSelect::get<T *>(Base);
    (void)V;
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            true);
    // ...
    Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // ...
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        // ...
          Mask[I] = SecMask[I] + Vec1VF;
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, false);
      // ...
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        // ...
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  bool IsBaseNotUndef = !IsBaseUndef.all();
  (void)IsBaseNotUndef;
  // Perform requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, false);
    // ...
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      // ...
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      // ...
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
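// BoUpSLP::getTreeCost (below) sums the per-entry costs, then adds the cost
// of extracting externally used scalars (or of keeping them scalar where that
// is cheaper), the spill cost across calls, and any final resize of a
// reduction root demoted by MinBWs; insertelement users are folded into
// shuffles and their insertion cost is subtracted.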
template <typename T> struct ShuffledInsertData {
  // ...
};

InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
  InstructionCost Cost = 0;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");
  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
  // ...
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // ...
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG({
        dbgs() << "SLP: Skipping cost for combined node that starts with "
               << *TE.Scalars[0] << ".\n";
        TE.dump();
        dbgs() << "SLP: Current total cost = " << Cost << "\n";
      });
      continue;
    }
    if (TE.isGather()) {
      if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
          E && E->getVectorFactor() == TE.getVectorFactor() &&
          E->isSame(TE.Scalars)) {
        // ...
        LLVM_DEBUG(/* ... */ dbgs()
                   << "SLP: Current total cost = " << Cost << "\n");
        continue;
      }
    }
    // ...
    assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
           "Expected gather nodes with users only.");
    // ...
    LLVM_DEBUG(/* ... */ dbgs()
               << "SLP: Current total cost = " << Cost << "\n");
  }
  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  // ...
  SmallVector<std::tuple<Value *, User *, int>> ScalarUserAndIdx;
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  }
  for (ExternalUser &EU : ExternalUses) {
    // ...
    if (EphValues.count(EU.User))
      continue;
    // ...
    BasicBlock *UserParent =
        EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
    if (/* ... */
        isa_and_present<UnreachableInst>(UserParent->getTerminator()))
      continue;
    // ...
    if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
        !ExtractCostCalculated.insert(EU.Scalar).second)
      continue;
    // ...
    if (isa<FixedVectorType>(EU.Scalar->getType()))
      continue;
    // ...
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
        /* ... */) {
      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
        if (!UsedInserts.insert(VU).second)
          continue;
        // ...
        const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
        // ...
        auto It = find_if(
            ShuffledInserts,
            [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
              // ...
              Value *Op0 = II->getOperand(0);
              if (getTreeEntry(II) && !getTreeEntry(Op0))
                // ...
            });
        int VecId = -1;
        if (It == ShuffledInserts.end()) {
          auto &Data = ShuffledInserts.emplace_back();
          Data.InsertElements.emplace_back(VU);
          // ...
          VecId = ShuffledInserts.size() - 1;
          auto It = MinBWs.find(ScalarTE);
          if (It != MinBWs.end() &&
              VectorCasts
                  .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                  .second) {
            unsigned BWSz = It->second.first;
            unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
            unsigned VecOpcode;
            if (DstBWSz < BWSz)
              VecOpcode = Instruction::Trunc;
            else
              VecOpcode =
                  It->second.second ? Instruction::SExt : Instruction::ZExt;
            // ...
                              FTy->getNumElements()),
            // ...
            LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                              << " for extending externally used vector with "
                                 "non-equal minimum bitwidth.\n");
            // ...
          }
        } else {
          // ...
          It->InsertElements.front() = VU;
          VecId = std::distance(ShuffledInserts.begin(), It);
        }
        int InIdx = *InsertIdx;
        SmallVectorImpl<int> &Mask =
            ShuffledInserts[VecId].ValueMasks[ScalarTE];
        // ...
        Mask[InIdx] = EU.Lane;
        DemandedElts[VecId].setBit(InIdx);
        continue;
      }
    }
    // ...
    auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
    const TreeEntry *Entry = getTreeEntry(EU.Scalar);
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      // ...
          ? Instruction::ZExt
          : Instruction::SExt;
      // ...
    }
    // ...
        EU.Lane, EU.Scalar, ScalarUserAndIdx);
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // ...
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
          auto *I = cast<Instruction>(U.Scalar);
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
        }
        return false;
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        for (const auto &P : enumerate(ExternalUses)) {
          // Ignore phis in loops.
          if (IsPhiInLoop(P.value()))
            continue;
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
        }
      }
      // ...
      auto *Inst = cast<Instruction>(EU.Scalar);
      // ...
      auto OperandIsScalar = [&](Value *V) {
        if (!getTreeEntry(V)) {
          // ...
          if (auto *EE = dyn_cast<ExtractElementInst>(V))
            return !EE->hasOneUse() || !MustGather.contains(EE);
          // ...
        }
        return ValueToExtUses->contains(V);
      };
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
            Op && all_of(Op->operands(), OperandIsScalar)) {
          InstructionCost OpCost =
              (getTreeEntry(Op) && !ValueToExtUses->contains(Op))
                  // ...
          if (ScalarCost + OpCost <= ExtraCost) {
            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
            ScalarCost += OpCost;
          }
        }
      }
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        // ...
        bool IsProfitablePHIUser =
            // ...
             VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
            // ...
              auto *PHIUser = dyn_cast<PHINode>(U);
              return (!PHIUser ||
                      PHIUser->getParent() !=
                          // ...
                          VectorizableTree.front()->getMainOp())
            // ...
            return ValueToExtUses->contains(V);
            // ...
        if (IsProfitablePHIUser) {
          // ...
        } else if (/* ... */
                   (!GatheredLoadsEntriesFirst.has_value() ||
                    Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          }
          // ...
          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
        }
        if (KeepScalar) {
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          for (Value *V : Inst->operands()) {
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              // ...
              ExternalUses[It->second].User = nullptr;
            }
          }
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            // ...
            if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
              for (Value *V : IOp->operands()) {
                auto It = ValueToExtUses->find(V);
                if (It != ValueToExtUses->end()) {
                  // ...
                  ExternalUses[It->second].User = nullptr;
                }
              }
            }
          }
        }
      }
    }
    ExtractCost += ExtraCost;
  }
  // ...
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    if (const TreeEntry *E = getTreeEntry(V)) {
      ExternalUses.emplace_back(V, nullptr, E->findLaneForValue(V));
    }
  }
  // ...
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      // ...
      unsigned SrcSz =
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        // ...
        if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
          // ...
        }
        // ...
      }
    }
  }
  // ...
  InstructionCost SpillCost = getSpillCost();
  Cost += SpillCost + ExtractCost;
  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
                                    bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
    if (VF != VecVF &&
        (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
         /* ... */)) {
      // ...
      std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
                OrigMask.begin());
      // ...
      LLVM_DEBUG({
        dbgs() << "SLP: Adding cost " << C
               << " for final shuffle of insertelement external users.\n";
        TE->dump();
        dbgs() << "SLP: Current total cost = " << Cost << "\n";
      });
      // ...
      return std::make_pair(TE, true);
    }
    return std::make_pair(TE, false);
  };
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    // ...
    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
                                    ArrayRef<const TreeEntry *> TEs) {
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
        // ...
        unsigned VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        // ...
            (Data.index() < VF &&
             static_cast<int>(Data.index()) == Data.value());
        // ...
        LLVM_DEBUG({
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of insertelement "
                    "external users.\n";
          TEs.front()->dump();
          dbgs() << "SLP: Current total cost = " << Cost << "\n";
        });
        // ...
      } else {
        unsigned VF = 0;
        if (TEs.front() &&
            TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
          VF = TEs.front()->getVectorFactor();
        // ...
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
        // ...
        LLVM_DEBUG({
          dbgs() << "SLP: Adding cost " << C
                 << " for final shuffle of vector node and external "
                    "insertelement users.\n";
          if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
          dbgs() << "SLP: Current total cost = " << Cost << "\n";
        });
        // ...
      }
      // ...
    };
    (void)performExtractsShuffleAction<const TreeEntry>(
        MutableArrayRef(Vector.data(), Vector.size()), Base,
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    InstructionCost InsertCost = TTI->getScalarizationOverhead(
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
        DemandedElts[I], /*Insert=*/true, /*Extract=*/false, CostKind);
    Cost -= InsertCost;
  }
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            all_of(*UserIgnoreList, [](Value *V) {
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
                                   Instruction::Xor},
                                  I->getOpcode());
            });
        if (IsArithmeticExtendedReduction)
          Opcode = Instruction::BitCast;
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
        auto *SrcVecTy =
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
        auto *DstVecTy =
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        TTI::CastContextHint CCH = TTI::CastContextHint::None;
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
          break;
        }
        // ...
        }
        // ...
        LLVM_DEBUG({
          dbgs() << "SLP: Adding cost " << CastCost
                 << " for final resize for reduction from " << SrcVecTy
                 << " to " << DstVecTy << "\n";
          dbgs() << "SLP: Current total cost = " << Cost << "\n";
        });
      }
    }
  }
  // ...
  OS << "SLP: Spill Cost = " << SpillCost << ".\n"
     << "SLP: Extract Cost = " << ExtractCost << ".\n"
     << "SLP: Total Cost = " << Cost << ".\n";
  // ...
  if (ViewSLPTree)
    ViewGraph(this, "SLP" + F->getName(), false, Str);
  // ...
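// tryToGatherSingleRegisterExtractElements scans a gather list for
// extractelements taken from the same (at most two) source vectors; if enough
// lanes match, it rewrites the mask so the gather can be emitted as a shuffle
// of those sources instead of a sequence of scalar inserts.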
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
  // ...
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI) {
      if (isa<UndefValue>(VL[I]))
        // ...
      continue;
    }
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
      continue;
    // ...
    ExtractMask.reset(*Idx);
    // ...
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  // ...
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  // ...
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // ...
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
      // ...
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
        // ...
  }
  for (int Idx : UndefVectorExtracts)
    // ...
  // ...
  std::optional<TTI::ShuffleKind> Res =
      isFixedVectorShuffle(GatheredExtracts, Mask);
  if (!Res)
    return std::nullopt;
  // ...
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
        isa<UndefValue>(GatheredExtracts[I])) {
      // ...
    }
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
        /* ... */)
      // ...
  }
  // ...
}

SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
  // ...
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // ...
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
}
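// isGatherShuffledSingleRegisterEntry checks whether one register-sized part
// of a gather node can instead be assembled by shuffling up to two existing
// tree entries, verifying dominance/ordering of the candidate entries before
// building the permutation mask and comparing shuffle vs. build-vector cost.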
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
  Entries.clear();
  // ...
  const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
                                ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
                                : TE->UserTreeIndices.front();
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  const BasicBlock *TEInsertBlock = nullptr;
  // ...
  if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
    // ...
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
  if (/* ... */)
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    // ...
    const BasicBlock *InsertBlock = InsertPt->getParent();
    auto *NodeEUI = DT->getNode(InsertBlock);
    // ...
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // ...
    if (TEInsertPt->getParent() != InsertBlock &&
        /* ... */)
      // ...
    if (TEInsertPt->getParent() == InsertBlock &&
        /* ... */)
      // ...
    return true;
  };
  // ...
  for (Value *V : VL) {
    // ...
    for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
      if (TEPtr == TE || TEPtr->Idx == 0)
        continue;
      assert(any_of(TEPtr->Scalars,
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndices.size() == 1 &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
      PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
      const Instruction *InsertPt =
          UserPHI ? /* ... */
                  : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        // ...
        if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
          continue;
        // ...
        if (TEUseEI.UserTE != UseEI.UserTE &&
            TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
          continue;
      }
      // ...
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
        continue;
      // ...
    }
    if (const TreeEntry *VTE = getTreeEntry(V)) {
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
        if (VTE->State != TreeEntry::Vectorize) {
          auto It = MultiNodeScalars.find(V);
          if (It == MultiNodeScalars.end())
            continue;
          VTE = *It->getSecond().begin();
          // ...
          auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
            return MTE->State == TreeEntry::Vectorize;
          });
          if (MIt == It->getSecond().end())
            continue;
          VTE = *MIt;
        }
      }
      // ...
      Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
      if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
        continue;
      // ...
    }
    if (VToTEs.empty())
      continue;
    if (UsedTEs.empty()) {
      // ...
    } else {
      // ...
      if (!VToTEs.empty()) {
        // ...
        VToTEs = SavedVToTEs;
      }
      // ...
      if (UsedTEs.size() == 2)
        continue;
      UsedTEs.push_back(SavedVToTEs);
      // ...
    }
  }

  if (UsedTEs.empty()) {
    Entries.clear();
    return std::nullopt;
  }
  // ...
  unsigned VF = 0;
  if (UsedTEs.size() == 1) {
    // ...
    SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
                                                UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    // ...
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      } else {
        // ...
      }
      // ...
      for (unsigned I : seq<unsigned>(VL.size()))
        if (isa<PoisonValue>(VL[I]))
          Mask[Part * VL.size() + I] = PoisonMaskElem;
      return TargetTransformInfo::SK_PermuteSingleSrc;
    }
    // ...
    Entries.push_back(FirstEntries.front());
    VF = FirstEntries.front()->getVectorFactor();
  } else {
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    DenseMap<unsigned, const TreeEntry *> VFToTE;
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
        continue;
      }
      VFToTE.try_emplace(VF, TE);
    }
    SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
                                                 UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        VF = It->first;
        Entries.push_back(It->second);
        Entries.push_back(TE);
        break;
      }
    }
    // ...
    if (Entries.empty()) {
      Entries.push_back(*min_element(
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          }));
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    } else {
      VF = Entries.front()->getVectorFactor();
    }
  }
  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  // ...
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // ...
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      // ...
      Value *In1 = PHI1->getIncomingValue(I);
      // ...
      if (cast<Instruction>(In)->getParent() !=
          /* ... */)
        // ...
    }
    return true;
  };
  // ...
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
           // ...
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  // ...
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    Value *V1 = VL[Idx];
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           // ...
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  };
  // ...
  for (int I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
      continue;
    // ...
    if (/* ... */
        ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
         (I != E - 1 && NeighborMightBeIgnored(V, I + 1))))
      continue;
    unsigned Idx = It->second;
    // ...
  }
  // ...
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
      continue;
    // ...
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
    // ...
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      /* ... */
          .slice(Part * VL.size(),
                 std::min<int>(VL.size(), TE->Scalars.size())))) {
    // ...
    return std::nullopt;
  }
  bool IsIdentity = Entries.size() == 1;
  // ...
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    // ...
    Mask[Idx] =
        Pair.first * VF +
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
    case 1:
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteSingleSrc;
      break;
    case 2:
      if (EntryLanes.size() > 2 || VL.size() <= 2)
        return TargetTransformInfo::SK_PermuteTwoSrc;
      break;
    default:
      break;
    }
  } else if (!isa<VectorType>(VL.front()->getType()) &&
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // ...
    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
                             std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
      // ...
    }
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        /* ... */ ((MaxElement % VF) - (MinElement % VF) + 1));
    // ...
      Idx = (Idx % VF) - (MinElement % VF) +
            (Idx >= static_cast<int>(VF) ? NewVF : 0);
    // ...
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&,
                           /* ... */](ArrayRef<int> Mask,
                                      ArrayRef<const TreeEntry *> Entries,
                                      VectorType *VecTy) -> InstructionCost {
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
          /* ... */(
              Mask, Entries.front()->getInterleaveFactor()))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI,
                              /* ... */);
    };
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    // ...
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
    } else {
      // ...
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(FirstMask)) {
        if (Idx >= static_cast<int>(VF)) {
          // ...
        } else {
          // ...
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
      FirstShuffleCost += TTI->getScalarizationOverhead(
          MaskVecTy, DemandedElts, /*Insert=*/true,
          /*Extract=*/false, CostKind);
    }
    // ...
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
    } else {
      // ...
      bool IsIdentity = true;
      for (auto [I, Idx] : enumerate(SecondMask)) {
        if (Idx < static_cast<int>(VF) && Idx >= 0) {
          // ...
        } else {
          // ...
          IsIdentity &= static_cast<int>(I) == Idx;
        }
      }
      SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
      SecondShuffleCost += TTI->getScalarizationOverhead(
          MaskVecTy, DemandedElts, /*Insert=*/true,
          /*Extract=*/false, CostKind);
    }
    // ...
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      copy(FirstMask, std::next(Mask.begin(), Part * VL.size()));
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    }
    if (SecondShuffleCost < ShuffleCost) {
      copy(SecondMask, std::next(Mask.begin(), Part * VL.size()));
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    }
    if (BuildVectorCost >= ShuffleCost) {
      // ...
      Entries.push_back(BestEntry);
      // ...
    }
  }
  // ...
  std::fill(std::next(Mask.begin(), Part * VL.size()),
            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
  return std::nullopt;
}
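// isGatherShuffledEntry splits a gather node into register-sized parts and
// asks isGatherShuffledSingleRegisterEntry for each part, returning the
// per-part shuffle kinds together with the tree entries to shuffle from.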
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
    bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // ...
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(/* ... */,
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // ...
  if (TE->isNonPowOf2Vec())
    return {};
  assert((TE->UserTreeIndices.size() == 1 ||
          TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  if (!TE->UserTreeIndices.empty() &&
      TE->UserTreeIndices.front().UserTE->isGather() &&
      TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
    assert((TE->Idx == 0 || TE->getOpcode() == Instruction::ExtractElement ||
            /* ... */) &&
           "Expected splat or extractelements only node.");
    // ...
  }
  // ...
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // ...
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    // ...
    if (/* ... */
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(SubEntries);
      // ...
      std::iota(Mask.begin(), Mask.end(), 0);
      // ...
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(1, LocalSubEntries.front());
      // ...
    }
  }
  if (all_of(Res,
             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
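// getGatherCost prices building a vector from scalars: an insertion cost for
// each unique non-constant value (with an extra permute when duplicates must
// be broadcast), and only the demanded lanes are charged when the source is
// poison.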
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  // ...
  bool DuplicateNonConst = false;
  // ...
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy) {
      // ...
    }
    // ...
  };
  SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    // ...
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
      // ...
      continue;
    }
    auto Res = UniqueElements.try_emplace(V, I);
    if (Res.second) {
      EstimateInsertCost(I, V);
      ShuffleMask[I] = I;
      continue;
    }
    DuplicateNonConst = true;
    ShuffleMask[I] = Res.first->second;
  }
  if (ForPoisonSrc) {
    if (isa<FixedVectorType>(ScalarTy)) {
      // ...
      for (unsigned I : seq<unsigned>(VL.size()))
        if (!ShuffledElements[I])
          // ...
              I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
      // ...
    }
    // ...
  }
  if (DuplicateNonConst)
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                             VecTy, ShuffleMask);
  return Cost;
}
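// getLastInstructionInBundle finds (and caches) the instruction after which
// the vectorized code for a tree entry must be emitted: normally the last
// bundle member in program order within its block, with special handling for
// gathered loads, GEP nodes, and entries that belong to a schedule bundle.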
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
  if (Res)
    return *Res;
  // ...
  auto *Front = E->getMainOp();
  auto *BB = Front->getParent();
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           E->getOpcode() == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (E->getOpcode() == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
    Instruction *LastInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (LastInst->getParent() == I->getParent()) {
        if (LastInst->comesBefore(I))
          LastInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              // ...
              (GatheredLoadsEntriesFirst.has_value() &&
               E->getOpcode() == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      // ...
      auto *NodeA = DT->getNode(LastInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
        LastInst = I;
    }
    // ...
    return LastInst;
  };

  auto FindFirstInst = [&]() {
    Instruction *FirstInst = Front;
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        continue;
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
          FirstInst = I;
        continue;
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              // ...
             "Expected vector-like or non-GEP in GEP node insts only.");
      // ...
      auto *NodeA = DT->getNode(FirstInst->getParent());
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
        FirstInst = I;
    }
    // ...
    return FirstInst;
  };

  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      E->getOpcode() == Instruction::Load) {
    Res = FindFirstInst();
    // ...
  }
  // ...
  if ((E->getOpcode() == Instruction::GetElementPtr &&
       any_of(E->Scalars, [](Value *V) {
         return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
       })) ||
      all_of(E->Scalars, [](Value *V) {
        return isa<PoisonValue>(V) ||
               (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
      }) ||
      (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
         return isa<ExtractElementInst, UndefValue>(V) ||
                areAllOperandsNonInsts(V);
       })))
    Res = FindLastInst();
  else
    Res = FindFirstInst();
  // ...
  if (BlocksSchedules.count(BB) && !E->isGather()) {
    Value *V = E->isOneOf(E->Scalars.back());
    // ...
    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
    if (Bundle && Bundle->isPartOfBundle())
      for (; Bundle; Bundle = Bundle->NextInBundle)
        Res = Bundle->Inst;
  }
  // ...
  if (!Res)
    Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
  return *Res;
}
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (IsPHI || /* ... */) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // ...
    Builder.SetInsertPoint(
        /* ... */);
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
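// BoUpSLP::gather materializes a gathered (build) vector: it emits one
// insertelement per scalar at the current insertion point, records external
// uses for scalars that belong to other tree entries, and postpones inserts
// of values defined inside the containing loop or in blocks leading to the
// insertion block, appending them at the end so loop-independent code can
// still be hoisted.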
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // ...
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           getTreeEntry(Inst) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(Inst, I);
  }

  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      // ...
      if (auto *CI = dyn_cast<CastInst>(Scalar);
          isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
        Value *Op = CI->getOperand(0);
        if (auto *IOp = dyn_cast<Instruction>(Op);
            !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
          Scalar = Op;
      }
      Scalar = Builder.CreateIntCast(
          Scalar, Ty, /* ... */);
    }

    Instruction *InsElt;
    if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
      // ...
      Vec = InsElt = Builder.CreateInsertVector(
          /* ... */);
      auto *II = dyn_cast<IntrinsicInst>(InsElt);
      if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
        return Vec;
    } else {
      Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
      InsElt = dyn_cast<InsertElementInst>(Vec);
      // ...
    }
    GatherShuffleExtractSeq.insert(InsElt);
    // ...
    if (isa<Instruction>(V)) {
      if (TreeEntry *Entry = getTreeEntry(V)) {
        // ...
        User *UserOp = nullptr;
        // ...
        if (auto *SI = dyn_cast<Instruction>(Scalar))
          UserOp = SI;
        // ...
        unsigned FoundLane = Entry->findLaneForValue(V);
        ExternalUses.emplace_back(V, UserOp, FoundLane);
      }
    }
    return Vec;
  };
  // ...
  SmallVector<int> Mask(VL.size());
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && isa<PoisonValue>(SV->getOperand(1)) &&
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  // ...
  for (int I = 0, E = VL.size(); I < E; ++I) {
    // ...
    if (isa<PoisonValue>(VL[I]))
      continue;
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
    // ...
  }
  if (Root) {
    if (isa<PoisonValue>(Vec)) {
      Vec = OriginalRoot;
    } else {
      Vec = CreateShuffle(Root, Vec, Mask);
      if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
          OI && OI->hasNUses(0) &&
          none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->VectorizedValue == OI;
          }))
        eraseInstruction(OI);
    }
  }
  // ...
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // ...
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
  return Vec;
}
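// ShuffleInstructionBuilder (below) is the IR-emitting counterpart of the
// cost-estimating shuffle builder used above: its inner ShuffleIRBuilder
// creates the actual shufflevector instructions, records them in
// GatherShuffleExtractSeq/CSEBlocks for later CSE, and adjustExtracts
// rewrites gathered extractelements to reuse already-vectorized operands.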
14002 bool IsFinalized =
false;
  class ShuffleIRBuilder {
    // ...
    ShuffleIRBuilder(/*...*/)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates a shufflevector of the two operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (V1->getType() != V2->getType()) {
          if (cast<VectorType>(V2->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            // ...
        }
      }
      // ...
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      // ...
    }
    /// Creates a permutation of the single vector operand with the given
    /// mask, if it is not an identity mask.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      unsigned VF = Mask.size();
      unsigned LocalVF =
          cast<FixedVectorType>(V1->getType())->getNumElements();
      // ...
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      // ...
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      // ...
    }
    /// Resizes the two input vectors to match their sizes, if needed.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      // ...
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      // ...
    }
  };
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder);
  }

  /// Transforms mask \p CommonMask per given \p Mask to make a proper set
  /// after shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      // ...
  }

  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    // ...
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
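  /// Rewrites a gather of extractelements as shuffles of the source vectors:
  /// resolves each extract's vector operand to its vectorized value, erases
  /// extracts whose only remaining use is vectorized here, and reports via
  /// UseVecBaseAsInput whether the common vector base can be used as the
  /// shuffle input directly.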
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    // ...
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      // ...
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      // ...
      auto *EI = cast<ExtractElementInst>(VL[I]);
      VecBase = EI->getVectorOperand();
      if (const TreeEntry *TE = R.getTreeEntry(VecBase))
        VecBase = TE->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If the only one use is vectorized - can delete the extractelement
      // itself.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            const TreeEntry *UTE = R.getTreeEntry(U);
            return !UTE || R.MultiNodeScalars.contains(U) ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return any_of(TE->UserTreeIndices,
                                            [&](const EdgeInfo &Edge) {
                                              return Edge.UserTE == UTE;
                                            }) &&
                                     is_contained(VL, EI);
                            }) != 1;
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    Value *Vec = nullptr;
    // ...
    for (unsigned Part : seq<unsigned>(NumParts)) {
      // ...
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (const TreeEntry *TE = R.getTreeEntry(VecOp))
              VecOp = TE->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        // ...
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (const TreeEntry *TE = R.getTreeEntry(VecOp))
          VecOp = TE->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert(/* ... Mask.slice(P * SliceSize, ...) ... */
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        unsigned NewVF =
            cast<FixedVectorType>(Vec->getType())->getNumElements();
        if (Vec->getType() != SubVec->getType()) {
          unsigned SubVecVF =
              cast<FixedVectorType>(SubVec->getType())->getNumElements();
          NewVF = std::max(NewVF, SubVecVF);
        }
        // Adjust SubMask.
        for (int &Idx : SubMask)
          // ...
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
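  /// Checks whether emission of this gather must be postponed: if some of the
  /// tree entries it depends on have no vectorized value yet, the node is
  /// delayed and re-emitted after those dependencies are materialized.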
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay the emission if all deps are already vectorized.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // ...
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
                                if (isa<PoisonValue>(V))
                                  return false;
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    Value *V2 = E2.VectorizedValue;
    if (V2->getType()->isIntOrIntVectorTy())
      V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
                                if (isa<PoisonValue>(V))
                                  return false;
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    add(V1, V2, Mask);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
                                if (isa<PoisonValue>(V))
                                  return false;
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    add(V1, Mask);
  }
  /// Adds two input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      // ...
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
14363 "castToScalarTyElem expects V1 to be FixedVectorType");
14364 V1 = castToScalarTyElem(V1);
14365 if (InVectors.
empty()) {
14367 CommonMask.
assign(Mask.begin(), Mask.end());
14370 const auto *It =
find(InVectors, V1);
14371 if (It == InVectors.
end()) {
14372 if (InVectors.
size() == 2 ||
14375 if (InVectors.
size() == 2) {
14376 V = createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
14377 transformMaskAfterShuffle(CommonMask, CommonMask);
14378 }
else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
14379 CommonMask.
size()) {
14380 V = createShuffle(InVectors.
front(),
nullptr, CommonMask);
14381 transformMaskAfterShuffle(CommonMask, CommonMask);
14383 unsigned VF = std::max(CommonMask.
size(), Mask.size());
14384 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14387 V->getType() != V1->
getType()
14389 : Mask[
Idx] + cast<FixedVectorType>(V1->
getType())
14390 ->getNumElements();
14391 if (V->getType() != V1->
getType())
14392 V1 = createShuffle(V1,
nullptr, Mask);
14393 InVectors.
front() = V;
14394 if (InVectors.
size() == 2)
14395 InVectors.
back() = V1;
14402 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14408 int VF = getVF(V1);
14409 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14411 CommonMask[
Idx] = Mask[
Idx] + (It == InVectors.
begin() ? 0 : VF);
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
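  /// Finalizes the shuffle construction: applies the optional Action to the
  /// accumulated vector, inserts the requested subvectors, folds ExtMask into
  /// the common mask, and emits the final one- or two-source shuffle.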
  Value *
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    // ...
    if (ScalarTyNumElements != 1) {
      // ...
      ExtMask = NewExtMask;
    }
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        // ...
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        // ...
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask);
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        // ...
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        // ...
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = E->VectorizedValue;
          if (V->getType()->isIntOrIntVectorTy())
            V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
                                     if (isa<PoisonValue>(V))
                                       return false;
                                     return !isKnownNonNegative(
                                         V, SimplifyQuery(*R.DL));
                                   }));
          unsigned InsertionIndex = Idx * ScalarTyNumElements;
          const unsigned SubVecVF =
              cast<FixedVectorType>(V->getType())->getNumElements();
          if (InsertionIndex % SubVecVF == 0) {
            Vec = Builder.CreateInsertVector(
                /*...*/, Builder.getInt64(InsertionIndex));
          } else {
            // insertvector requires the index to be a multiple of the
            // subvector length, so fall back to a shuffle.
            const unsigned VecVF =
                cast<FixedVectorType>(Vec->getType())->getNumElements();
            SmallVector<int> Mask(VecVF, PoisonMaskElem);
            std::iota(Mask.begin(), Mask.end(), 0);
            for (unsigned I : seq<unsigned>(
                     InsertionIndex, (Idx + SubVecVF) * ScalarTyNumElements))
              // ...
            Vec = createShuffle(Vec, V, Mask);
          }
          if (!CommonMask.empty()) {
            std::iota(std::next(CommonMask.begin(), InsertionIndex),
                      std::next(CommonMask.begin(),
                                (Idx + E->getVectorFactor()) *
                                    ScalarTyNumElements),
                      InsertionIndex);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        SmallVector<int> SVMask(SubVectorsMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          // ...
          I1 = I2 + CommonMask.size();
        }
        // ...
        Vec = createShuffle(InsertVec, Vec, SVMask);
        for (unsigned I : seq<unsigned>(CommonMask.size())) {
          // ...
        }
      }
      InVectors.front() = Vec;
      // ...
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
                                                         unsigned NodeIdx) {
  ArrayRef<Value *> VL = E->getOperand(NodeIdx);
  InstructionsState S = getSameOpcode(VL, *TLI);
  // Special processing for GEPs bundle, which may include non-gep values.
  if (!S && VL.front()->getType()->isPointerTy()) {
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    if (It != VL.end())
      S = getSameOpcode(*It, *TLI);
  }
  if (!S)
    return nullptr;
  auto CheckSameVE = [&](const TreeEntry *VE) {
    return VE->isSame(VL) &&
           (any_of(VE->UserTreeIndices,
                   [E, NodeIdx](const EdgeInfo &EI) {
                     return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
                   }) ||
            any_of(VectorizableTree,
                   [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
                     return TE->isOperandGatherNode(
                                {const_cast<TreeEntry *>(E), NodeIdx}) &&
                            VE->isSame(TE->Scalars);
                   }));
  };
  TreeEntry *VE = getTreeEntry(S.getMainOp());
  if (VE && CheckSameVE(VE))
    return VE;
  auto It = MultiNodeScalars.find(S.getMainOp());
  if (It != MultiNodeScalars.end()) {
    auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
      return TE != VE && CheckSameVE(TE);
    });
    if (I != It->getSecond().end())
      return *I;
  }
  return nullptr;
}
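/// Vectorizes the operand at index NodeIdx of entry E. Reuses the vectorized
/// value of a matching tree entry when one exists; otherwise vectorizes the
/// corresponding operand gather node.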
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                 bool PostponedPHIs) {
  ValueList &VL = E->getOperand(NodeIdx);
  const unsigned VF = VL.size();
  if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
    // ...
    auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
      Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
      ShuffleInstructionBuilder ShuffleBuilder(/*...*/);
      // ...
      ShuffleBuilder.add(V, Mask);
      SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
          E->CombinedEntriesWithIndices.size());
      transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
                [&](const auto &P) {
                  return std::make_pair(VectorizableTree[P.first].get(),
                                        P.second);
                });
      assert((E->CombinedEntriesWithIndices.empty() ||
              E->ReorderIndices.empty()) &&
             "Expected either combined subnodes or reordering");
      return ShuffleBuilder.finalize({}, SubVectors, {});
    };
    Value *V = vectorizeTree(VE, PostponedPHIs);
    if (VF * getNumElements(VL[0]->getType()) !=
        cast<FixedVectorType>(V->getType())->getNumElements()) {
      if (!VE->ReuseShuffleIndices.empty()) {
        // ...
        SmallVector<int> Mask(VF, PoisonMaskElem);
        for (auto [I, V] : enumerate(VL)) {
          if (isa<PoisonValue>(V))
            continue;
          Mask[I] = VE->findLaneForValue(V);
        }
        V = FinalShuffle(V, Mask);
      } else {
        assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
               "Expected vectorization factor less "
               "than original vector size.");
        SmallVector<int> UniformMask(VF, 0);
        std::iota(UniformMask.begin(), UniformMask.end(), 0);
        V = FinalShuffle(V, UniformMask);
      }
    }
    // Need to update the operand gather node, if actually the operand is not
    // a vectorized node, but the buildvector/gather node, which matches one
    // of the vectorized nodes.
    if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
          return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
        }) == VE->UserTreeIndices.end()) {
      auto *It =
          find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
                   TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
          });
      assert(It != VectorizableTree.end() && "Expected gather node operand.");
      (*It)->VectorizedValue = V;
    }
    return V;
  }
  // Find the corresponding gather entry and vectorize it.
  auto *I = find_if(VectorizableTree,
                    [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
                      return TE->isOperandGatherNode({E, NodeIdx});
                    });
  assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
  assert(I->get()->UserTreeIndices.size() == 1 &&
         "Expected only single user for the gather node.");
  assert(I->get()->isSame(VL) && "Expected same list of scalars.");
  return vectorizeTree(I->get(), PostponedPHIs);
}
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();
  bool NeedFreeze = false;
  SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
                                       E->ReuseShuffleIndices.end());
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Clear values, to be replaced by insertvector instructions.
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    for_each(MutableArrayRef(GatheredScalars)
                 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
             /*...*/);
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  // Build a mask out of the reorder indices and reorder scalars per this mask.
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  SmallVector<int> SubVectorsMask;
  // ...
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (unsigned I : seq<unsigned>(GatheredScalars.size()))
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
  } else {
    SubVectorsMask.clear();
  }
  SmallVector<Value *> StoredGS(GatheredScalars);
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    // ...
    //     return isa<UndefValue>(V) && !isa<PoisonValue>(V);
    // ...
    TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
    unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      auto *It =
          find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
            return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
                     return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
                   }) != TE->UserTreeIndices.end();
          });
      if (It == VectorizableTree.end())
        return false;
      // ...
      if (!(*It)->ReorderIndices.empty()) {
        // ...
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
                   (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
                    /*...*/);
          }))
        return false;
    }
    // ...
    if ((Mask.size() < InputVF &&
         /*...*/) ||
        (Mask.size() == InputVF &&
         /*...*/)) {
      std::iota(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), /*...*/), /*...*/);
    } else {
      // ...
      std::fill(std::next(Mask.begin(), I * SliceSize),
                std::next(Mask.begin(), /*...*/), /*...*/);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask;
  // ...
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  // ...
  if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
      /*...*/)
    NumParts = 1;
  if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
    // Check for gathered extracts.
    bool Resized = false;
    ExtractShuffles =
        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(ExtractMask)) {
        // ...
        if (const auto *TE = getTreeEntry(
                cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand()))
          ExtractEntries.push_back(TE);
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            Resized = true;
            GatheredScalars.append(VF - GatheredScalars.size(),
                                   PoisonValue::get(OrigScalarTy));
          }
      }
    }
    // Gather extracts after we check for full matched gathers only.
    if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
        ((E->getOpcode() == Instruction::Load ||
          any_of(E->Scalars, IsaPred<LoadInst>)) &&
         any_of(E->Scalars, [this](Value *V) {
           return isa<LoadInst>(V) && getTreeEntry(V);
         })) ||
        E->isAltShuffle() ||
        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
        /*...*/ ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffles =
          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
    }
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          /*...*/ &&
          Entries.front().front()->isSame(E->Scalars)) {
        // Perfect match in the graph, will reuse the previously vectorized
        // node. Cost is 0.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          /*...*/);
        // Restore the mask for previous partially matched values.
        Mask.resize(E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
        } else {
          for (auto [I, V] : enumerate(E->Scalars)) {
            if (isa<PoisonValue>(V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        ShuffleBuilder.add(*FrontTE, Mask);
        Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors,
                                      SubVectorsMask);
        return Res;
      }
      if (GatheredScalars.size() != VF &&
          any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
            return any_of(TEs, [&](const TreeEntry *TE) {
              return TE->getVectorFactor() == VF;
            });
          })) {
        Resized = true;
        GatheredScalars.append(VF - GatheredScalars.size(),
                               PoisonValue::get(OrigScalarTy));
      }
    }
    // ...
  }
  // ...
  for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
    // ...
  }
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   /*...*/;
    // ...
    int NumNonConsts = 0;
    // ...
    for (auto [I, V] : enumerate(Scalars)) {
      if (isa<UndefValue>(V)) {
        if (!isa<PoisonValue>(V)) {
          // ...
        }
        continue;
      }
      // ...
      if (IsSplat) {
        Scalars.front() = OrigV;
        // ...
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore single insert element.
      // ...
      if (!UndefPos.empty() && UndefPos.front() == 0)
        // ...
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with a simple broadcast of a
      // guaranteed-non-poison scalar.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (/*...*/ ||
                (E->UserTreeIndices.size() == 1 &&
                 any_of(V->uses(), [E](const Use &U) {
                   // Check if the value is already used in the same operation
                   // in one of the nodes.
                   return E->UserTreeIndices.front().EdgeIdx !=
                              U.getOperandNo() &&
                          is_contained(
                              E->UserTreeIndices.front().UserTE->Scalars,
                              U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          ReuseMask[I] = Pos;
          // ...
        }
      } else {
        // Replace undefs by poisons, emit broadcast and then emit freeze.
        for (int I : UndefPos) {
          // ...
          if (isa<UndefValue>(Scalars[I]))
            // ...
        }
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // Gather of extractelements can be represented as just a shuffle of a
      // single/two vectors the scalars are extracted from. Find those input
      // vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        // ...
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          if (isa<UndefValue>(E->Scalars[I]))
            continue;
          auto *EI = cast<ExtractElementInst>(StoredGS[I]);
          Value *VecOp = EI->getVectorOperand();
          if (const auto *TE = getTreeEntry(VecOp))
            if (TE->VectorizedValue)
              VecOp = TE->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        // ...
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        bool IsNotPoisonedVec = /*...*/;
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        // ...
      }
    }
    if (!GatherShuffles.empty()) {
      // ...
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        // ...
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? /*...*/
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            // ...
        }
      }
    }
    // Try to figure out the best way to combine values: build a shuffle and
    // insert elements, or just build several shuffles.
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if currently we
    // have a single permutation and more than one scalar constant.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(V) && !isa<PoisonValue>(V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(V) && !isa<UndefValue>(V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(V) && !isa<PoisonValue>(V);
         }) > 1);
    // NonConstants array contains just non-constant values, GatheredScalars
    // contains only constants to build the final vector and then shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        // ...
      else
        // ...
    }
    // Generate constants for the final shuffle and build a mask for them.
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask,
          E->Scalars.size(), [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
            TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
            Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
          });
  } else if (!allConstant(GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
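/// Instantiates processBuildVector with the IR-emitting
/// ShuffleInstructionBuilder, after making sure the combined sub-entries are
/// vectorized first.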
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  bool PostponedPHIs) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder, *this);
}
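/// Emits vector code for the tree entry E, dispatching on the (possibly
/// alternate) opcode of the bundle. The result is cached in
/// E->VectorizedValue so that diamond matches in the graph are emitted only
/// once.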
// ...
  for (Value *V : VL)
    if (isa<Instruction>(V))
      // ...
// ...

Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
  // ...
  if (E->VectorizedValue &&
      (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
       E->isAltShuffle())) {
    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0]
                      << ".\n");
    return E->VectorizedValue;
  }

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  if (!isa<CmpInst>(V))
    ScalarTy = /*...*/;
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    // ...
  }
  // ...
  if (E->isGather()) {
    // Set insert point for non-reduction initial nodes.
    if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
    E->VectorizedValue = Vec;
    return Vec;
  }

  bool IsReverseOrder =
      !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  // ...
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() ||
            !E->UserTreeIndices.empty()) &&
           "PHI reordering is free.");
    if (PostponedPHIs && E->VectorizedValue)
      return E->VectorizedValue;
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    // ...
    if (PostponedPHIs || !E->VectorizedValue) {
      // ...
      // Adjust insertion point once all PHI's have been generated.
      Builder.SetInsertPoint(PH->getParent(),
                             PH->getParent()->getFirstInsertionPt());
      // ...
      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      if (PostponedPHIs)
        return V;
    }
    PHINode *NewPhi = cast<PHINode>(E->PHI);
    // PHINodes may have multiple entries from the same block. We want to
    // visit every block once.
    // ...
    for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
      // ...
      // Stop emission if all incoming values are generated.
      if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return V;
      }

      if (!VisitedBBs.insert(IBB).second) {
        // ...
        continue;
      }
      // ...
      Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      NewPhi->addIncoming(Vec, IBB);
    }

    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
           "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    if (const TreeEntry *TE = getTreeEntry(V))
      V = TE->VectorizedValue;
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    // ...
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    // ...
    Value *V = vectorizeOperand(E, 1, PostponedPHIs);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
      // ...
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
          V,
          getWidenedType(
              ScalarTy,
              cast<FixedVectorType>(V->getType())->getNumElements()),
          Res.second);
    }

    // Create InsertVector shuffle if necessary.
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();

    // ...
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create shuffle to resize vector.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      // ...
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    // ...
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      // ...
      IsIdentity &= InsertIdx - Offset == I;
      Mask[InsertIdx - Offset] = I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous = /*...*/;
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // Follow all insert element instructions from the current buildvector
        // sequence.
        InsertElementInst *Ins = cast<InsertElementInst>(VL0);
        do {
          // ...
          InsertMask[*InsertIdx] = *InsertIdx;
          if (!Ins->hasOneUse())
            break;
          Ins = dyn_cast_or_null<InsertElementInst>(
              Ins->getUniqueUndroppableUser());
        } while (Ins);
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        // ...
        if (!IsFirstPoison.all()) {
          unsigned Idx = 0;
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              // ...
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              ++Idx;
            }
            // ...
          }
        }
      }
      // ...
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    for (unsigned I = 0; I < NumElts; I++) {
      if (Mask[I] != PoisonMaskElem)
        InsertMask[Offset + I] = I;
    }
    SmallBitVector UseMask =
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
    // ...
    if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
        NumElts != NumScalars) {
      if (IsFirstUndef.all()) {
        // ...
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
              InsertMask[I] = I + NumElts;
          }
        }
        V = Builder.CreateShuffleVector(
            /*...*/,
            InsertMask, cast<Instruction>(E->Scalars.back())->getName());
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      } else {
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        for (unsigned I = 0; I < NumElts; I++) {
          if (InsertMask[I] == PoisonMaskElem)
            // ...
          else
            InsertMask[I] += NumElts;
        }
        V = Builder.CreateShuffleVector(
            FirstInsert->getOperand(0), V, InsertMask,
            cast<Instruction>(E->Scalars.back())->getName());
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
    }

    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    auto *CI = cast<CastInst>(VL0);
    Instruction::CastOps VecOpcode = CI->getOpcode();
    Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         /*...*/)) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *R = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (L->getType() != R->getType()) {
      assert((getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }

    // ...
    if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
      ICmp->setSameSign(/*B=*/false);
    // Do not cast for cmps.
    VecTy = cast<FixedVectorType>(V->getType());
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *True = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *False = vectorizeOperand(E, 2, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
              getOperandEntry(E, 2)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }

    // ...
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
    assert(TrueNumElements == getNumElements(False->getType()) &&
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      // When the return type is i1 but the source is a fixed vector type, we
      // need to duplicate the condition value.
      // ...
    }
    assert(getNumElements(Cond->getType()) == TrueNumElements &&
           "Cannot vectorize Instruction::Select");
    Value *V = Builder.CreateSelect(Cond, True, False);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);

    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    // ...
    if (auto *I = dyn_cast<Instruction>(V))
      V = ::propagateMetadata(I, E->Scalars);

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);

    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    if (Op->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
        // An 'and' with a constant whose low bits are all ones is a no-op in
        // the minimized bit width, so one operand can be used directly.
        if (all_of(E->getOperand(I), [&](Value *Op) {
              auto *CI = dyn_cast<ConstantInt>(Op);
              return CI && CI->getValue().countr_one() >= It->second.first;
            })) {
          V = FinalShuffle(I == 0 ? RHS : LHS, E);
          E->VectorizedValue = V;
          ++NumVectorInstructions;
          return V;
        }
      }
    }
    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      // ...
    }

    // ...
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = ::propagateMetadata(I, E->Scalars);
      // Drop nuw flags for abs(sub(commutative), true).
      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          any_of(E->Scalars, [](Value *V) {
            return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
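    // Loads are emitted in one of three forms depending on the entry state:
    // a consecutive wide load (Vectorize), a strided load via the
    // llvm.experimental.vp.strided.load intrinsic (StridedVectorize), or a
    // masked gather for scattered pointers (ScatterVectorize).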
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::StridedVectorize) {
      Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
      Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
      PO = IsReverseOrder ? PtrN : Ptr0;
      // ...
      Value *StrideVal;
      if (Diff) {
        int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
        StrideVal =
            ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                           DL->getTypeAllocSize(ScalarTy));
      } else {
        SmallVector<Value *> PointerOps(E->Scalars.size());
        transform(E->Scalars, PointerOps.begin(), [](Value *V) {
          return cast<LoadInst>(V)->getPointerOperand();
        });
        std::optional<Value *> Stride =
            calculateRtStride(PointerOps, ScalarTy, *DL, *SE, /*...*/);
        // ...
        StrideVal = Builder.CreateMul(
            /*...*/,
            ConstantInt::get(StrideTy,
                             (IsReverseOrder ? -1 : 1) *
                                 static_cast<int>(
                                     DL->getTypeAllocSize(ScalarTy))));
      }
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {VecTy, PO->getType(), StrideTy},
          {PO, StrideVal, /*...*/,
           Builder.getInt32(E->Scalars.size())});
      // ...
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      if (isa<FixedVectorType>(ScalarTy)) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        // CreateMaskedGather expects VecTy and VecPtr to have the same size,
        // so we need to expand VecPtr if ScalarTy is a vector type.
        unsigned ScalarTyNumElements =
            cast<FixedVectorType>(ScalarTy)->getNumElements();
        unsigned VecTyNumElements =
            cast<FixedVectorType>(VecTy)->getNumElements();
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
        SmallVector<Constant *> Indices(VecTyNumElements);
        transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
          return Builder.getInt64(I % ScalarTyNumElements);
        });
        // ...
      }
      // Use the minimum alignment of the gathered loads.
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = ::propagateMetadata(NewLI, E->Scalars);

    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);

    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);

    Value *Ptr = SI->getPointerOperand();
    Instruction *ST;
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
        Ptr = SI->getPointerOperand();
      }
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          {VecValue, Ptr,
           ConstantInt::get(
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           /*...*/,
           Builder.getInt32(E->Scalars.size())});
      // ...
      ST = Inst;
    }

    Value *V = ::propagateMetadata(ST, E->Scalars);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = ::propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    SmallVector<Type *> ArgTys =
        buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
                               It != MinBWs.end() ? It->second.first : 0, TTI);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    // ...
    auto *CEI = cast<CallInst>(VL0);
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      // Some intrinsics have scalar arguments. Such arguments should not be
      // vectorized.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
        ScalarArg = CEI->getArgOperand(I);
        // ...
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.CreateIntCast(ScalarArg, /*...*/);
        // ...
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      ScalarArg = CEI->getArgOperand(I);
      if (cast<VectorType>(OpVec->getType())->getElementType() !=
              ScalarArg->getType()->getScalarType() &&
          It == MinBWs.end()) {
        auto *CastTy = /*...*/;
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      OpVecs.push_back(OpVec);
      // ...
    }

    Function *CF;
    if (!UseIntrinsic) {
      // ...
    }
    // ...
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    Value *V;
    if (SLPReVec && !E->isAltShuffle()) {
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      // ...
      if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
        assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
               "Not supported shufflevector usage.");
        // Compose the two masks into one.
        SmallVector<int> NewMask(ThisMask.size());
        transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
          return SVSrc->getShuffleMask()[Mask];
        });
        V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
      } else {
        V = Builder.CreateShuffleVector(Src, ThisMask);
      }
      // ...
      if (auto *I = dyn_cast<Instruction>(V))
        V = ::propagateMetadata(I, E->Scalars);
      V = FinalShuffle(V, E);
    } else {
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");

      Value *LHS = nullptr, *RHS = nullptr;
      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        RHS = vectorizeOperand(E, 1, PostponedPHIs);
      } else {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0, PostponedPHIs);
      }
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      if (LHS && RHS &&
          ((Instruction::isBinaryOp(E->getOpcode()) &&
            (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
           (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
        if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
          if (cast<VectorType>(LHS->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            CastTy = RHS->getType();
          else
            CastTy = LHS->getType();
        }
        if (LHS->getType() != CastTy)
          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
        if (RHS->getType() != CastTy)
          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
      }

      Value *V0, *V1;
      if (Instruction::isBinaryOp(E->getOpcode())) {
        V0 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
        V1 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
        auto *AltCI = cast<CmpInst>(E->getAltOp());
        CmpInst::Predicate AltPred = AltCI->getPredicate();
        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
      } else {
        if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              cast<VectorType>(LHS->getType())->getElementType());
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
            assert(LHS->getType() == VecTy &&
                   "Expected same type as operand.");
            if (auto *I = dyn_cast<Instruction>(LHS))
              LHS = ::propagateMetadata(I, E->Scalars);
            LHS = FinalShuffle(LHS, E);
            E->VectorizedValue = LHS;
            ++NumVectorInstructions;
            return LHS;
          }
        }
        // ...
      }
      // Add V0 and V1 to later analysis to try to find and remove matching
      // instructions, if any.
      for (Value *V : {V0, V1}) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      // Create shuffle to take alternate operations from the vector.
      // Also, gather up main and alt scalar ops to propagate IR flags to
      // each vector operation.
      ValueList OpScalars, AltScalars;
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [E, this](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask, &OpScalars, &AltScalars);

      // ...
      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
        // Drop nuw flags for abs(sub(commutative), true).
        if (auto *I = dyn_cast<Instruction>(Vec);
            I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            any_of(E->Scalars, [](Value *V) {
              if (isa<PoisonValue>(V))
                return false;
              auto *IV = cast<Instruction>(V);
              return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      };
      DropNuwFlag(V0, E->getOpcode());
      DropNuwFlag(V1, E->getAltOpcode());

      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
        // ...
      }
      V = Builder.CreateShuffleVector(V0, V1, Mask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        V = ::propagateMetadata(I, E->Scalars);
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  default:
    llvm_unreachable("unknown inst");
  }
  return nullptr;
}
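// Final materialization for the whole tree: run the per-block schedulers,
// emit gathered loads and postponed PHI operands first, then the postponed
// gather nodes, fix up minimized bit widths, and extract all externally used
// scalars.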
  // ...
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }
  // ...
  EntryToLastInstruction.clear();
  // ...
  // Emit gathered loads first to emit better code for the users of those
  // gathered loads.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst &&
        (!TE->isGather() || !TE->UserTreeIndices.empty())) {
      assert((!TE->UserTreeIndices.empty() ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
    }
  }
  // Postpone emission of PHIs operands to avoid cyclic dependency issues.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
    if (TE->State == TreeEntry::Vectorize &&
        TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
        TE->VectorizedValue)
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
  // Emit gathered nodes, postponed to avoid emission before the dependencies.
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
      if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
              TE->UserTreeIndices.front().EdgeIdx)) &&
          VecTE->isSame(TE->Scalars))
        // Found gather node which is absolutely the same as one of the
        // vectorized nodes. It may happen after reordering.
        continue;
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI =
        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // ...
    if (isa<PHINode>(UserI)) {
      // ...
    } else {
      for (User *U : PrevVec->users()) {
        // ...
        auto *UI = dyn_cast<Instruction>(U);
        if (!UI || isa<PHINode>(UI) ||
            UI->getParent() != InsertPt->getParent())
          continue;
        if (UI->comesBefore(InsertPt))
          InsertPt = UI;
      }
    }
    // ...
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        /*...*/)
      // ...
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        if (const TreeEntry *BaseTE = getTreeEntry(V)) {
          auto It = MinBWs.find(BaseTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            // ...
          }
          for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
            auto It = MinBWs.find(MNTE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              // ...
            }
          }
          if (IsSigned.value_or(false))
            break;
          // Scan through the gather nodes.
          for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
            auto It = MinBWs.find(BVE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              // ...
            }
          }
          if (IsSigned.value_or(false))
            break;
          if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
            IsSigned =
                IsSigned.value_or(false) ||
                /*...*/;
            continue;
          }
          if (IsSigned.value_or(false))
            break;
        }
      }
      if (IsSigned.value_or(false)) {
        // Final attempt - check the user node.
        auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(IsSigned &&
             "Expected user node or perfect diamond match in MinBWs.");
      // ...
    }
    // ...
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node, if it was used before for postponed
    // dependencies.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
  }
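  // Extract each externally used scalar from its vectorized value, caching
  // the extracts per block in ScalarToEEs and widening them with int casts
  // when the scalar was emitted in a minimized bit width.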
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    // ...
    TreeEntry *E = getTreeEntry(Scalar);
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    // Non-instruction pointers are not deleted, just skipped.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    // ...
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        auto *Inst = dyn_cast<Instruction>(Scalar);
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts, just move the only one in the
          // current block.
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                /*...*/) {
              // ...
              if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
                CI->moveAfter(I);
            }
            Ex = PrevV;
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // "Reuse" the existing extract to improve final codegen.
          if (ReplaceInst) {
            // Leave the instruction as is, if it is cheaper to extract and
            // all operands are scalar.
            if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
              IgnoredExtracts.insert(EE);
              Ex = EE;
            } else {
              auto *CloneInst = Inst->clone();
              CloneInst->insertBefore(Inst);
              if (Inst->hasName())
                CloneInst->takeName(Inst);
              Ex = CloneInst;
            }
          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
                     ES && isa<Instruction>(Vec)) {
            Value *V = ES->getVectorOperand();
            auto *IVec = cast<Instruction>(Vec);
            if (const TreeEntry *ETE = getTreeEntry(V))
              V = ETE->VectorizedValue;
            if (auto *IV = dyn_cast<Instruction>(V);
                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            else
              // ...
          } else if (auto *VecTy =
                         dyn_cast<FixedVectorType>(Scalar->getType())) {
            // ...
            Ex = Builder.CreateExtractVector(
                /*...*/,
                Vec, Builder.getInt64(ExternalUse.Lane * VecTyNumElements));
          } else {
            // ...
          }
          // If necessary, sign-extend or zero-extend ScalarRoot to the larger
          // type.
          ExV = Ex;
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(
                Ex, Scalar->getType(), /*...*/);
          auto *I = dyn_cast<Instruction>(Ex);
          ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
                                            : &F->getEntryBlock(),
                                          std::make_pair(Ex, ExV));
        }
        // The then branch of the previous if may produce constants, since 0
        // operand might be a constant.
        if (auto *ExI = dyn_cast<Instruction>(Ex);
            ExI && !isa<PHINode>(ExI)) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        return ExV;
      }
      assert(isa<FixedVectorType>(Scalar->getType()) &&
             isa<InsertElementInst>(Scalar) &&
             "In-tree scalar of vector type is not insertelement?");
      auto *IE = cast<InsertElementInst>(Scalar);
      // ...
      return Vec;
    };
    // If User == nullptr, the Scalar remains as scalar in vectorized
    // instructions or is used as an extra arg.
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert(
          (/*...*/ ||
           ExternalUsesAsOriginalScalar.contains(Scalar) ||
           any_of(Scalar->users(),
                  [&](llvm::User *U) {
                    if (ExternalUsesAsOriginalScalar.contains(U))
                      return true;
                    TreeEntry *UseEntry = getTreeEntry(U);
                    return UseEntry &&
                           (UseEntry->State == TreeEntry::Vectorize ||
                            UseEntry->State ==
                                TreeEntry::StridedVectorize) &&
                           (E->State == TreeEntry::Vectorize ||
                            E->State == TreeEntry::StridedVectorize) &&
                           doesInTreeUserNeedToExtract(
                               Scalar, getRootEntryInstruction(*UseEntry),
                               /*...*/);
                  })) &&
          "Scalar with nullptr User must be registered in "
          "ExternallyUsedValues map or remain as scalar in vectorized "
          "instructions");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI)) {
          if (PHI->getParent()->isLandingPad())
            Builder.SetInsertPoint(
                PHI->getParent(),
                std::next(
                    PHI->getParent()->getLandingPadInst()->getIterator()));
          else
            Builder.SetInsertPoint(PHI->getParent(),
                                   PHI->getParent()->getFirstNonPHIIt());
        } else {
          Builder.SetInsertPoint(VecI->getParent(),
                                 std::next(VecI->getIterator()));
        }
      } else {
        // ...
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // Required to update internally referenced instructions.
      if (Scalar != NewInst) {
        assert((!isa<ExtractElementInst>(Scalar) ||
                !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
               "Extractelements should not be replaced.");
        Scalar->replaceAllUsesWith(NewInst);
      }
      continue;
    }
    if (auto *VU = dyn_cast<InsertElementInst>(User);
        VU && /*...*/) {
      // ...
      if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
        if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
          if (!UsedInserts.insert(VU).second)
            continue;
          // Need to use the original vector, if the root is truncated.
          auto BWIt = MinBWs.find(E);
          if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
            auto *ScalarTy = FTy->getElementType();
            auto Key = std::make_pair(Vec, ScalarTy);
            auto VecIt = VectorCasts.find(Key);
            if (VecIt == VectorCasts.end()) {
              // ...
              if (auto *IVec = dyn_cast<PHINode>(Vec)) {
                if (IVec->getParent()->isLandingPad())
                  Builder.SetInsertPoint(IVec->getParent(),
                                         std::next(IVec->getParent()
                                                       ->getLandingPadInst()
                                                       ->getIterator()));
                else
                  Builder.SetInsertPoint(
                      IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
              } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
                // ...
              }
              Vec = Builder.CreateIntCast(
                  Vec,
                  getWidenedType(
                      ScalarTy,
                      cast<FixedVectorType>(Vec->getType())->getNumElements()),
                  BWIt->second.second);
              // ...
            } else {
              Vec = VecIt->second;
            }
          }

          // ...
          auto *It = find_if(
              ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
                // Checks if two insertelements are from the same buildvector.
                // ...
              });
          unsigned Idx = *InsertIdx;
          if (It == ShuffledInserts.end()) {
            // ...
            It = std::next(ShuffledInserts.begin(),
                           ShuffledInserts.size() - 1);
          }
          // ...
          Mask[Idx] = ExternalUse.Lane;
          It->InsertElements.push_back(cast<InsertElementInst>(User));
          continue;
        }
      }
    }

    // Generate extracts for out-of-tree users and find the insertion point
    // for the extractelement lane.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (PHINode *PH = dyn_cast<PHINode>(User)) {
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              // ...
            }
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        // ...
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        User->replaceUsesOfWith(Scalar, NewInst);
      }
    } else {
      // ...
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      User->replaceUsesOfWith(Scalar, NewInst);
    }
    // ...
  }
16676 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16686 int VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
16687 for (
int I = 0, E = Mask.size();
I < E; ++
I) {
16689 CombinedMask1[
I] = Mask[
I];
16691 CombinedMask2[
I] = Mask[
I] - VF;
16694 cast<VectorType>(V1->
getType())->getElementType(), Builder, *
this);
16695 ShuffleBuilder.
add(V1, CombinedMask1);
16697 ShuffleBuilder.
add(V2, CombinedMask2);
16698 return ShuffleBuilder.
finalize({}, {}, {});
16702 bool ForSingleMask) {
16703 unsigned VF = Mask.size();
16704 unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
16706 if (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); })) {
16707 Vec = CreateShuffle(Vec,
nullptr, Mask);
16708 return std::make_pair(Vec,
true);
16710 if (!ForSingleMask) {
16712 for (
unsigned I = 0;
I < VF; ++
I) {
16714 ResizeMask[Mask[
I]] = Mask[
I];
16716 Vec = CreateShuffle(Vec,
nullptr, ResizeMask);
16720 return std::make_pair(Vec,
false);
16724 for (
int I = 0, E = ShuffledInserts.
size();
I < E; ++
I) {
16730 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
16731 Value *NewInst = performExtractsShuffleAction<Value>(
16735 return cast<VectorType>(Vec->getType())
16736 ->getElementCount()
16737 .getKnownMinValue();
16742 assert((Vals.size() == 1 || Vals.size() == 2) &&
16743 "Expected exactly 1 or 2 input values.");
16744 if (Vals.size() == 1) {
16747 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
16748 ->getNumElements() ||
16749 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
16750 return CreateShuffle(Vals.front(), nullptr, Mask);
16751 return Vals.front();
16753 return CreateShuffle(Vals.
front() ? Vals.
front()
16755 Vals.
back(), Mask);
16757 auto It = ShuffledInserts[
I].InsertElements.
rbegin();
16760 if (It != ShuffledInserts[
I].InsertElements.
rend())
16763 while (It != ShuffledInserts[
I].InsertElements.
rend()) {
16764 assert(
II &&
"Must be an insertelement instruction.");
16769 II = dyn_cast<InsertElementInst>(
II->getOperand(0));
16772 II->replaceUsesOfWith(
II->getOperand(0), NewInst);
16773 if (
auto *NewI = dyn_cast<Instruction>(NewInst))
16774 if (
II->getParent() == NewI->getParent() &&
II->comesBefore(NewI))
16775 II->moveAfter(NewI);
16778 LastInsert->replaceAllUsesWith(NewInst);
16780 IE->replaceUsesOfWith(IE->getOperand(0),
16782 IE->replaceUsesOfWith(IE->getOperand(1),
16786 CSEBlocks.
insert(LastInsert->getParent());
  // For each vectorized value:
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather())
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
        continue;
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
        continue;
      if (isa<PoisonValue>(Scalar))
        continue;

      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
        for (User *U : Scalar->users()) {
          // It is legal to delete users in the ignorelist.
          assert((getTreeEntry(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
        }
      }

      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      auto *I = cast<Instruction>(Scalar);
      RemovedInsts.push_back(I);
    }
  }

  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);
  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntry(I);
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() &&
            !IE->UserTreeIndices.empty() &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             any_of(IE->UserTreeIndices,
                    [&](const EdgeInfo &EI) {
                      return EI.UserTE == VectorizableTree.front().get() &&
                             EI.EdgeIdx == UINT_MAX;
                    }))) &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather()))
        continue;
      SmallVector<SelectInst *> LogicalOpSelects;
      I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
        // Do not replace condition of the logical op in form select <cond>.
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          return false;
        }
        return UserIgnoreList->contains(U.getUser());
      });
    }
  }
  removeInstructionsAndOperands(ArrayRef(RemovedInsts));

  InstrElementSize.clear();

  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                        cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
  }
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;
    // Do not hoist if any operand is defined inside the loop.
    if (any_of(I->operands(), [&](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;
    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator());
    CSEBlocks.insert(PreHeader);
  }
  // Sort blocks by domination, so that a block is visited only after all
  // blocks dominating it were visited.
  llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Between two shuffles, one is "less defined" if it has the same vector
  // operands and its mask indices are the same as in the other one or undef.
  auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
                                           SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the trailing undefs actually change the final number of used
    // vector registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(SI1->getType()) ==
               TTI->getNumberOfParts(
                   getWidenedType(SI1->getType()->getElementType(),
                                  SM1.size() - LastUndefsCnt));
  };
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(*(*I)->getBlock())) {
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the visited
      // instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          eraseInstruction(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          In.moveAfter(V);
          V->replaceAllUsesWith(&In);
          eraseInstruction(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced)
        Visited.push_back(&In);
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
  ScheduleData *Bundle = nullptr;
  ScheduleData *PrevInBundle = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member "
           "(maybe not in same basic block)");
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle)
      PrevInBundle->NextInBundle = BundleMember;
    else
      Bundle = BundleMember;

    // Group the instructions to a bundle.
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  assert(Bundle && "Failed to find schedule bundle");
  return Bundle;
}
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  // No need to schedule PHIs, insertelement, extractelement and extractvalue
  // instructions.
  if (isa<PHINode>(S.getMainOp()) ||
      isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
    return nullptr;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.getMainOp() << "\n");

  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
                                                           ScheduleData *Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    if (ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
        if (ScheduleData *SD = getScheduleData(I))
          SD->clearDependencies();
      ReSchedule = true;
    }
    if (Bundle) {
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
    }

    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleData *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
             "must be ready to schedule");
      schedule(Picked, ReadyInsts);
    }
  };

  // Make sure that the scheduling region contains all instructions of the
  // bundle.
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // Otherwise the compiler may crash trying to incorrectly calculate
      // dependencies and emit instructions in the wrong order at the actual
      // scheduling.
      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");

    // Make sure we don't leave the pieces of the bundle in the ready list
    // when the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);

    if (!BundleMember->IsScheduled)
      continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle. Just get rid of the
    // existing schedule.
    LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  auto *Bundle = buildBundle(VL);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle->isReady()) {
    cancelScheduling(VL, S.getMainOp());
    return std::nullopt;
  }
  return Bundle;
}
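// The protocol above is try-and-rollback: the bundle is linked first, then
// dependencies are recalculated; if the bundle never becomes ready (which
// indicates a cyclic dependency), cancelScheduling() unlinks it again.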
  ScheduleData *Bundle = getScheduleData(OpValue);
  LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() &&
         "tried to unbundle something which is not a bundle");

  // Remove the bundle from the ready list, if it is in the ready list.
  if (Bundle->isReady())
    ReadyInsts.remove(Bundle);

  // Un-bundle: make single instructions out of the bundle.
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->TE = nullptr;
    if (BundleMember->unscheduledDepsInBundle() == 0)
      ReadyInsts.insert(BundleMember);
    BundleMember = Next;
  }
}
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
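// Chunked allocation: ScheduleData objects are handed out from fixed-size
// arrays, so previously returned pointers stay stable when a new chunk is
// pushed, unlike a flat std::vector<ScheduleData> which would reallocate.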
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
         !doesNotNeedToBeScheduled(I) &&
         "phi nodes/insertelements/extractelements/extractvalues don't need to "
         "be scheduled");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region. Ignore
  // assume-like intrinsics so they are not counted towards the region size.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
      return false;
    }
    ++UpIter;
    ++DownIter;
    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
    if (doesNotNeedToBeScheduled(I))
      continue;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore)
        CurrentLoadStore->NextLoadStore = SD;
      else
        FirstLoadStoreInRegion = SD;
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
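// Besides per-instruction ScheduleData, the region threads a second intrusive
// list (NextLoadStore) through all memory-accessing instructions;
// calculateDependencies() below walks it linearly to add memory edges.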
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  SmallVector<ScheduleData *, 10> WorkList;
  WorkList.push_back(SD);
  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.pop_back_val();
    for (ScheduleData *BundleMember = SD; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      assert(isInSchedulingRegion(BundleMember));
      if (BundleMember->hasValidDependencies())
        continue;

      BundleMember->Dependencies = 0;
      BundleMember->resetUnscheduledDeps();

      // Handle def-use chain dependencies.
      for (User *U : BundleMember->Inst->users()) {
        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
      }

      auto MakeControlDependent = [&](Instruction *I) {
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->ControlDependencies.push_back(BundleMember);
        BundleMember->Dependencies++;
        ScheduleData *DestBundle = DepDest->FirstInBundle;
        if (!DestBundle->IsScheduled)
          BundleMember->incrementUnscheduledDeps(1);
        if (!DestBundle->hasValidDependencies())
          WorkList.push_back(DestBundle);
      };

      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
        for (Instruction *I = BundleMember->Inst->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (isSafeToSpeculativelyExecute(I))
            continue;
          // Add the dependency.
          MakeControlDependent(I);
          if (!isGuaranteedToTransferExecutionToSuccessor(I))
            break;
        }
      }

      if (RegionHasStackSave) {
        // Allocas must be scheduled after any preceding stacksave, and before
        // any following stackrestore.
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst,
                  m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              break;
            if (!isa<AllocaInst>(I))
              continue;
            // Add the dependency.
            MakeControlDependent(I);
          }
        }

        // In addition, allocas must not move below stacksave/stackrestore.
        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              continue;
            // Add the dependency.
            MakeControlDependent(I);
            break;
          }
        }
      }

      // Handle the memory dependencies (if any).
      ScheduleData *DepDest = BundleMember->NextLoadStore;
      if (!DepDest)
        continue;
      Instruction *SrcInst = BundleMember->Inst;
      assert(SrcInst->mayReadOrWriteMemory() &&
             "NextLoadStore list for non memory effecting bundle?");
      bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
      unsigned NumAliased = 0;
      unsigned DistToSrc = 1;

      for (; DepDest; DepDest = DepDest->NextLoadStore) {
        assert(isInSchedulingRegion(DepDest));
        if (DistToSrc >= MaxMemDepDistance ||
            ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
             (NumAliased >= AliasedCheckLimit ||
              SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
          DepDest->MemoryDependencies.push_back(BundleMember);
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = DepDest->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
        ++DistToSrc;
      }
    }
    if (InsertInReadyList && SD->isReady()) {
      ReadyInsts.insert(SD);
      LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst
                        << "\n");
    }
  }
}
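// Dependency calculation adds three kinds of edges: def-use edges for
// in-region users, control edges (via MakeControlDependent) for instructions
// that must not be reordered across possible early exits or
// stacksave/stackrestore pairs, and memory edges along the NextLoadStore
// chain, bounded by MaxMemDepDistance and AliasedCheckLimit to keep the
// quadratic alias scan cheap.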
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = getScheduleData(I)) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->IsScheduled = false;
      SD->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This ensures that the
  // instruction order within a bundle is preserved.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated and fill the ready-list with
  // initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] TreeEntry *SDTE = getTreeEntry(SD->Inst);
      assert((isVectorLikeInstWithConstOps(SD->Inst) ||
              SD->isPartOfBundle() ==
                  (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;

      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      Instruction *PickedInst = BundleMember->Inst;
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveBefore(LastScheduledInst);
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

#ifdef EXPENSIVE_CHECKS
  // ...
#endif
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type.
  unsigned Width = 0;
  Value *FirstNonBool = nullptr;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V)) {
    SmallVector<Instruction *, 16> Worklist;
    Worklist.push_back(I);
    BasicBlock *Parent = I->getParent();
    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();
      auto *Ty = I->getType();
      if (isa<VectorType>(Ty))
        continue;
      if (Ty != Builder.getInt1Ty() && !FirstNonBool)
        FirstNonBool = I;

      // If the current instruction is a load, update Width to reflect the
      // width of the loaded value.
      if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
        Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));

      // Otherwise, we need to visit the operands of the instruction.
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            Worklist.push_back(J);
            if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
              FirstNonBool = U.get();
          }
      }
    }
  }

  // If we didn't encounter a memory access in the expression tree, just
  // return the width of V.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());

  // Check if the node was analyzed already and must be kept wide.
  if (NodesToKeepBWs.contains(E.Idx))
    return false;

  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(V))
      return true;
    // For unsigned values an extra bit is needed, otherwise reused scalars
    // may get incorrect casting.
    bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
      if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
        return true;
    }
    unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    if (IsSignedVal)
      ++BitWidth1;
    if (auto *I = dyn_cast<Instruction>(V)) {
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
        if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
          break;
        BitWidth2 *= 2;
      }
      BitWidth1 = std::min(BitWidth1, BitWidth2);
    }
    BitWidth = std::max(BitWidth, BitWidth1);
    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
  };
  using namespace std::placeholders;
  auto FinalAnalysis = [&]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      // Check possible extractelement instructions bases and final vector
      // length.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
              TTI->getNumberOfParts(getWidenedType(
                  IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !getTreeEntry(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !all_of(V->users(), [=](User *U) {
          return getTreeEntry(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;
  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          BitWidth = OrigBitWidth;
          return false;
        }
        MaxDepthLevel = 1;
        BitWidth = BestFailBitwidth;
        NeedToExit = true;
        return true;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                              std::ref(BitWidth)));
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (E.UserTreeIndices.size() > 1 &&
              !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                           std::ref(BitWidth))))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };
  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // inrange amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // We can truncate to a smaller lshr iff we know that the bits we would
    // otherwise be shifting in are already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // We can truncate to a smaller ashr iff we know that all the bits from
    // the sign bit of the original type down to the sign bit of the truncated
    // type are identical.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                                nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin &&
        ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      unsigned MinBW = PowerOf2Ceil(BitWidth);
      SmallVector<Type *> ArgTys =
          buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
      auto VecCallCosts = getVectorCallCosts(
          IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
          TTI, TLI, ArgTys);
      InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
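// collectValuesToDemote() is the legality half of bit-width minimization: it
// records tree entries whose results need fewer bits than their declared
// type, e.g. i32 adds of zero-extended i8 values that are only ever truncated
// back. The per-opcode checkers above guarantee the narrowed operation
// produces identical low bits.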
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->getOpcode() == Instruction::Store ||
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  if (VectorizableTree[NodeIdx]->isGather() ||
      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
      (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                              [NodeIdx](const EdgeInfo &EI) {
                                return EI.UserTE->Idx > NodeIdx;
                              })))
    return;

  // The first value node for store/insertelement is sext/zext/trunc? Skip it,
  // it is only a cast to widen/narrow the stored/inserted value.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  SmallDenseSet<unsigned, 8> NodesToKeepBWs;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;
  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth =
      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
          unsigned Opcode, unsigned Limit, bool IsTruncRoot,
          bool IsSignedCmp) -> unsigned {
    // Check if the root is trunc and the next node is gather/buildvector,
    // then keep trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 none_of(V->users(), [&](User *U) {
                   const TreeEntry *TE = getTreeEntry(U);
                   const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
                   if (TE == UserTE || !TE)
                     return false;
                   if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                            SelectInst>(U) ||
                       !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                            SelectInst>(UserTE->getMainOp()))
                     return true;
                   unsigned UserTESz = DL->getTypeSizeInBits(
                       UserTE->Scalars.front()->getType());
                   auto It = MinBWs.find(TE);
                   if (It != MinBWs.end() && It->second.first > UserTESz)
                     return true;
                   return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                 });
        })) {
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT || !Opcode)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    unsigned NumParts = TTI->getNumberOfParts(getWidenedType(TreeRootIT, VF));

    // The maximum bit width required to represent all the values that can be
    // demoted without loss is the bitwidth of the maximum value, plus a sign
    // bit if the value is signed.
    unsigned MaxBitWidth = 1u;

    // Determine if the sign bit of all the roots is known to be zero. If not,
    // IsKnownPositive is set to false.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    // We first check if all the bits of the roots are demanded. If they're
    // not, we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
        continue;
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
      TypeSize NumTypeBits =
          DL->getTypeSizeInBits(Root->getType()->getScalarType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, add one to the maximum
      // bit width to account for the unknown sign bit. This preserves the
      // existing sign bit so we can safely sign-extend the root back to the
      // original type.
      if (!IsKnownPositive)
        ++BitWidth1;

      APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // If the original type is large, we need to make sure the operation fits
    // into fewer vector registers after narrowing.
    if (NumParts > 1 &&
        NumParts ==
            TTI->getNumberOfParts(getWidenedType(
                IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
      return 0u;

    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the values that can be demoted in ToDemote and
    // additional roots that require investigating in Roots.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);

    return MaxBitWidth;
  };
  // Reduction roots are allowed to be narrowed to the reduction bitwidth.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    // x86 does not profit from vector_reduce_add(ZExt(<n x i1>)) kept wide.
    if (all_of(*UserIgnoreList,
               [](Value *V) {
                 return isa<PoisonValue>(V) ||
                        cast<Instruction>(V)->getOpcode() == Instruction::Add;
               }) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
          continue;
        unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
          ++BitWidth1;
        unsigned BitWidth2 = BitWidth1;
        if (auto *I = dyn_cast<Instruction>(V)) {
          APInt Mask = DB->getDemandedBits(I);
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        }
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;
      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
        Limit, IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            unsigned OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [](const EdgeInfo &EI) {
                   return EI.EdgeIdx == 0 &&
                          EI.UserTE->getOpcode() == Instruction::Trunc &&
                          !EI.UserTE->isAltShuffle();
                 });
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [&](const EdgeInfo &EI) {
                   return EI.UserTE->getOpcode() == Instruction::ICmp &&
                          any_of(EI.UserTE->Scalars, [&](Value *V) {
                            auto *IC = dyn_cast<ICmpInst>(V);
                            return IC &&
                                   (IC->isSigned() ||
                                    !isKnownNonNegative(IC->getOperand(0),
                                                        SimplifyQuery(*DL)) ||
                                    !isKnownNonNegative(IC->getOperand(1),
                                                        SimplifyQuery(*DL)));
                          });
                 });
    }

    // If the maximum bit width we compute is less than the width of the
    // roots' type, we can proceed with the narrowing. Otherwise, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
                ->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
      NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
      ToDemote.clear();
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
  // ...
  DL = &F.getDataLayout();

  bool Changed = false;

  // If the target claims to have no vector registers don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs()
        << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom up slp vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
    LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
  }
  return Changed;
}
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  Size = 0;
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!has_single_bit(Sz) ||
      !hasFullVectorsOrPowerOf2(
          *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
          VF) ||
      VF < 2 || VF < MinVF)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << VF
                    << "\n");

  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsAllowedSize =
        hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                                 ValOps.size()) ||
        (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
    if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(), [&](Value *V) {
            return !isa<ExtractElementInst>(V) &&
                   (V->getNumUses() > Chain.size() ||
                    any_of(V->users(), [&](User *U) {
                      return !Stores.contains(U);
                    }));
          }))) ||
        (ValOps.size() > Chain.size() / 2 && !S)) {
      Size = (!IsAllowedSize && S) ? 1 : 2;
      return false;
    }
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
                    << "\n");
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}
  unsigned Num = 0;
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        if (Size == 1)
          return V;
        ++Num;
        return V + Size;
      });
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       if (P == 1)
                         return V;
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 81 / (Mean * Mean) == 0;
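// The final test is an integer-only variance check: Dev * 81 / (Mean * Mean)
// == 0 holds exactly when Dev < Mean^2 / 81, i.e. the standard deviation of
// the recorded tree sizes is below Mean / 9 (a coefficient of variation of
// roughly 11%), so only fairly uniform graphs pass.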
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store
  // twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  struct StoreDistCompare {
    bool operator()(const std::pair<unsigned, int> &Op1,
                    const std::pair<unsigned, int> &Op2) const {
      return Op1.second < Op2.second;
    }
  };
  // A set of pairs (index of store in Stores array ref, Distance of the store
  // address relative to base store address in units).
  using StoreIndexToDistSet =
      std::set<std::pair<unsigned, int>, StoreDistCompare>;
  auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
    int PrevDist = -1;
    BoUpSLP::ValueList Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(Set)) {
      if (Operands.empty() || Data.second - PrevDist == 1) {
        Operands.push_back(Stores[Data.first]);
        PrevDist = Data.second;
        if (Idx != Set.size() - 1)
          continue;
      }
      auto E = make_scope_exit([&, &DataVar = Data]() {
        Operands.clear();
        Operands.push_back(Stores[DataVar.first]);
        PrevDist = DataVar.second;
      });

      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      unsigned MinVF = std::max<unsigned>(
          2, PowerOf2Ceil(TTI->getStoreMinimumVF(
                 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
                 ValueTy)));

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF ("
                          << MaxVF << ") < MinVF (" << MinVF << ")\n");
        continue;
      }
      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // registers are used.
        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
        if (has_single_bit(CandVF + 1)) {
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        }
      }

      unsigned MaxRegVF = MaxVF;
      MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF ("
                          << MaxVF << ") < MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
      SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
      unsigned Size = MinVF;
      for_each(reverse(CandidateVFs), [&](unsigned &VF) {
        VF = Size > MaxVF ? NonPowerOf2VF : Size;
        Size *= 2;
      });
      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
        P.first = P.second = 1;
      });
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned Size : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned StartIdx = std::distance(
              RangeSizes.begin(),
              find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
                                            std::placeholders::_1)));
          while (StartIdx < End) {
            unsigned EndIdx =
                std::distance(RangeSizes.begin(),
                              find_if(RangeSizes.drop_front(StartIdx),
                                      std::bind(IsVectorized, Size >= MaxRegVF,
                                                std::placeholders::_1)));
            unsigned Sz = EndIdx >= End ? End : EndIdx;
            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
              if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
                                  Size >= MaxRegVF)) {
                ++Cnt;
                continue;
              }
              ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
                  Cnt += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
              if (!Res) {
                NonSchedulable
                    .try_emplace(Slice.front(), std::make_pair(Size, Size))
                    .first->getSecond()
                    .second = Size;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert(Slice.begin(), Slice.end());
                AnyProfitableGraph = RepeatChanged = Changed = true;
                // If we vectorized the initial block, no need to try to
                // vectorize it again.
                for_each(RangeSizes.slice(Cnt, Size),
                         [](std::pair<unsigned, unsigned> &P) {
                           P.first = P.second = 0;
                         });
                if (Cnt < StartIdx + MinVF) {
                  for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  StartIdx = Cnt + Size;
                }
                if (Cnt > Sz - Size - MinVF) {
                  for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  Sz = Cnt;
                }
                Cnt += Size;
                continue;
              }
              if (Size > 2 && Res &&
                  !all_of(RangeSizes.slice(Cnt, Size),
                          std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
                                    std::placeholders::_1))) {
                Cnt += Size;
                continue;
              }
              // Check for very big VFs that we're not rebuilding the same
              // trees, just with a larger number of elements.
              if (Size > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(Cnt, Size),
                         std::bind(FirstSizeSame, TreeSize,
                                   std::placeholders::_1))) {
                Cnt += Size;
                while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
                  ++Cnt;
                continue;
              }
              if (TreeSize > 1)
                for_each(RangeSizes.slice(Cnt, Size),
                         [&](std::pair<unsigned, unsigned> &P) {
                           if (Size >= MaxRegVF)
                             P.second = std::max(P.second, TreeSize);
                           else
                             P.first = std::max(P.first, TreeSize);
                         });
              ++Cnt;
              AnyProfitableGraph = true;
            }
            if (StartIdx >= End)
              break;
            if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
              AnyProfitableGraph = true;
            StartIdx = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(Sz),
                        std::bind(IsNotVectorized, Size >= MaxRegVF,
                                  std::placeholders::_1)));
          }
          if (!AnyProfitableGraph && Size >= MaxRegVF)
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if tried all attempts or no need for the last attempts at
        // all.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            Operands.size() - 1,
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes, std::bind(IsNotVectorized, true,
                                                  std::placeholders::_1))) +
                1));
        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
        unsigned Limit =
            getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
        CandidateVFs.clear();
        if (bit_floor(Limit) == VF)
          CandidateVFs.push_back(Limit);
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        });
        // Last attempt to vectorize the max number of elements, if all
        // previous attempts were unsuccessful because of the cost issues.
        CandidateVFs.push_back(VF);
      }
    }
  };
  // Stores pair (first: index of the store into Stores array ref, address of
  // which is taken as base, second: sorted set of pairs {index, dist}, which
  // are indices of stores in the set and their store location distances
  // relative to the base address).
  SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
      std::optional<int> Diff = getPointersDiff(
          Stores[Set.first]->getValueOperand()->getType(),
          Stores[Set.first]->getPointerOperand(),
          SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
          /*StrictCheck=*/true);
      if (!Diff)
        continue;
      auto It = Set.second.find(std::make_pair(Idx, *Diff));
      if (It == Set.second.end()) {
        Set.second.emplace(Idx, *Diff);
        return;
      }
      // Try to vectorize the first found set to avoid duplicate analysis.
      TryToVectorize(Set.second);
      unsigned ItIdx = It->first;
      int ItDist = It->second;
      StoreIndexToDistSet PrevSet;
      copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
              [&](const std::pair<unsigned, int> &Pair) {
                return Pair.first > ItIdx;
              });
      Set.second.clear();
      Set.first = Idx;
      Set.second.emplace(Idx, 0);
      // Insert stores that followed the previous match to try to vectorize
      // them with this store.
      unsigned StartIdx = ItIdx + 1;
      SmallBitVector UsedStores(Idx - StartIdx);
      // Distances to the previously found duplicate store (or this store,
      // since they store to the same addresses).
      SmallVector<int> Dists(Idx - StartIdx, 0);
      for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
        // Do not try to vectorize sequences, we already tried.
        if (VectorizedStores.contains(Stores[Pair.first]))
          break;
        unsigned BI = Pair.first - StartIdx;
        UsedStores.set(BI);
        Dists[BI] = Pair.second - ItDist;
      }
      for (unsigned I = StartIdx; I < Idx; ++I) {
        unsigned BI = I - StartIdx;
        if (UsedStores.test(BI))
          Set.second.emplace(I, Dists[BI]);
      }
      return;
    }
    auto &Res = SortedStores.emplace_back();
    Res.first = Idx;
    Res.second.emplace(Idx, 0);
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (auto &Set : SortedStores)
        TryToVectorize(Set.second);
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (auto &Set : SortedStores)
    TryToVectorize(Set.second);

  return Changed;
}
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointers.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }
    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type, we permit
  // an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S)
    return false;

  Instruction *I0 = S.getMainOp();
  // Make sure invalid types (including vector type) are rejected before
  // determining the vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give the user an internal llvm type name,
      // which may not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream rso(TypeStr);
        Ty->print(rso);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();

  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
        break;

      SmallVector<Value *> Ops(ActualVF, nullptr);
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          Ops[Idx] = V;
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough candidates - exit.
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      R.reorderTopToBottom();
      R.reorderBottomToTop(
          /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
          !R.doesRootHaveInTreeUses());
      R.transformNodes();
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost "
                         << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  Value *P = I->getParent();

  // Vectorize in the current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidates.
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
      R);
}
  /// The operation data of the reduction operations.
  ReductionOpsListType ReductionOps;
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;

  static bool isCmpSelMinMax(Instruction *I) {
    return isa<SelectInst>(I) &&
           RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
  }

  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I) {
    if (Kind == RecurKind::None)
      return false;

    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0.
      return I->getFastMathFlags().noNaNs();
    }

    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;

    return I->isAssociative();
  }

  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    // Poison-safe 'or' takes the form: select X, true, Y. To make that work
    // with the normal operand processing, the true value operand is skipped.
    if (match(I, m_Select(m_Value(), m_One(), m_Value())) && Index == 1)
      return I->getOperand(2);
    return I->getOperand(Index);
  }

  /// Creates reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
    switch (Kind) {
    case RecurKind::Or:
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    case RecurKind::And:
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul:
      return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    case RecurKind::FMax:
      return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
    case RecurKind::FMin:
      return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
    case RecurKind::FMaximum:
      return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
    case RecurKind::FMinimum:
      return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
    case RecurKind::SMax:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
    case RecurKind::SMin:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
    case RecurKind::UMax:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
    case RecurKind::UMin:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
  }

  /// Creates reduction operation with the current opcode with the IR flags
  /// from \p ReductionOps, dropping nuw/nsw flags.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
        propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
                         /*IncludeWrapFlags=*/false);
        propagateIRFlags(Op, ReductionOps[1], nullptr,
                         /*IncludeWrapFlags=*/false);
        return Op;
      }
    }
    propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
    return Op;
  }

  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;
    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;
    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Try harder: look for min/max patterns based on select of cmp of
      // extractelements.
      Value *LHS = Select->getTrueValue();
      Value *RHS = Select->getFalseValue();
      // ...
      if (!isa<ExtractElementInst>(RHS) ||
          !match(RHS, m_ExtractElt(m_Value(), m_ConstantInt())))
        return RecurKind::None;
      if (!isa<ExtractElementInst>(LHS) ||
          !match(LHS, m_ExtractElt(m_Value(), m_ConstantInt())))
        return RecurKind::None;
      // ...
      switch (Pred) {
      case CmpInst::ICMP_SGT:
        return RecurKind::SMax;
      case CmpInst::ICMP_SLT:
        return RecurKind::SMin;
      case CmpInst::ICMP_UGT:
        return RecurKind::UMax;
      case CmpInst::ICMP_ULT:
        return RecurKind::UMin;
      default:
        break;
      }
    }
    return RecurKind::None;
  }

  /// Get the index of the first operand.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB. For a cmp+sel min/max
  /// reduction check that both the compare and the select are in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // The select must be used twice while the condition must have a single
      // use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }
    // An arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    auto *I = dyn_cast<Instruction>(Data.front());
    return Sz > 1 || isConstant(Data.front()) ||
           (I && !isa<LoadInst>(I));
  }
  /// Try to find a reduction tree rooted at \p Root.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;

    // Though the ultimate reduction may have multiple uses, its condition
    // must have only single use.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // If the edge is not an instruction, or it is different from the main
        // reduction opcode or has too many uses - possible reduced value.
        if (!EdgeInst || Level > RecursionMaxDepth ||
            getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to try to
    // reduce them. Values are grouped by their value ids, loads - by loaded
    // pointers.
    MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
        PossibleReducedVals;
    initReductionOps(Root);
    DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
      if (!LoadKeyUsed.insert(Key).second) {
        auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
        if (LIt != LoadsMap.end()) {
          for (LoadInst *RLI : LIt->second) {
            if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                                LI->getType(), LI->getPointerOperand(), DL,
                                SE, /*StrictCheck=*/true))
              return hash_value(RLI->getPointerOperand());
          }
          for (LoadInst *RLI : LIt->second) {
            if (arePointersCompatible(RLI->getPointerOperand(),
                                      LI->getPointerOperand(), TLI))
              return hash_value(RLI->getPointerOperand());
          }
          if (LIt->second.size() > 2)
            return hash_value(LIt->second.back()->getPointerOperand());
        }
      }
      LoadsMap.try_emplace(std::make_pair(Key, Ptr))
          .first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // Add reduction values. The values are sorted for better vectorization
      // results.
      for (Value *V : PossibleRedVals) {
        size_t Key, Idx;
        std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                                               /*AllowAlternate=*/false);
        ++PossibleReducedVals[Key][Idx]
              .insert(std::make_pair(V, 0))
              .first->second;
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds to start the reduction
    // from the longest possible reduced values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
           It != E; ++It) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = It->second.takeVector();
        stable_sort(RedValsVect, llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      int NewIdx = -1;
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (NewIdx < 0 ||
            (!isGoodForReduction(Data) &&
             (!isa<LoadInst>(Data.front()) ||
              !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
              getUnderlyingObject(
                  cast<LoadInst>(Data.front())->getPointerOperand()) !=
                  getUnderlyingObject(
                      cast<LoadInst>(ReducedVals[NewIdx].front())
                          ->getPointerOperand())))) {
          NewIdx = ReducedVals.size();
          ReducedVals.emplace_back();
        }
        ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
      }
    }
    return true;
  }
  /// Attempt to vectorize the tree found by matchAssociativeReduction.
  Value *tryToReduce(BoUpSLP &V, const DataLayout &DL,
                     TargetTransformInfo *TTI, const TargetLibraryInfo &TLI) {
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // If there are a sufficient number of reduction values, reduce to a
    // nearby power-of-2. We can safely generate oversized vectors and rely on
    // the backend to split them to legal sizes.
    if (unsigned NumReducedVals = std::accumulate(
            ReducedVals.begin(), ReducedVals.end(), 0,
            [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
              if (!isGoodForReduction(Vals))
                return Num;
              return Num + Vals.size();
            });
        NumReducedVals < ReductionLimit) {
      for (ReductionOpsType &RdxOps : ReductionOps)
        for (Value *RdxOp : RdxOps)
          V.analyzedReductionRoot(cast<Instruction>(RdxOp));
      return nullptr;
    }

    IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
                                    TargetFolder(DL));
    Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));

    // Track the reduced values in case they are replaced by extractelement
    // because of the vectorization.
    DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
                                                  ReducedVals.front().size());
    // The compare instruction of a min/max is the insertion point for new
    // instructions and may be replaced with a new compare instruction.
    auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
      assert(isa<SelectInst>(RdxRootInst) &&
             "Expected min/max reduction to have select root instruction");
      Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
      assert(isa<Instruction>(ScalarCond) &&
             "Expected min/max reduction to have compare condition");
      return cast<Instruction>(ScalarCond);
    };

    bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
      return isBoolLogicOp(cast<Instruction>(V));
    });
    // Return new VectorizedTree, based on the previous value.
    auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
      if (VectorizedTree) {
        // Update the final value in the reduction.
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
        if (AnyBoolLogicOp) {
          auto It = ReducedValsToOps.find(VectorizedTree);
          auto It1 = ReducedValsToOps.find(Res);
          if ((It == ReducedValsToOps.end() &&
               It1 == ReducedValsToOps.end()) ||
              isGuaranteedNotToBePoison(VectorizedTree) ||
              (It != ReducedValsToOps.end() &&
               any_of(It->getSecond(), [&](Instruction *I) {
                 return isBoolLogicOp(I) &&
                        getRdxOperand(I, 0) == VectorizedTree;
               }))) {
            ; // The first operand is safe.
          } else if (isGuaranteedNotToBePoison(Res) ||
                     (It1 != ReducedValsToOps.end() &&
                      any_of(It1->getSecond(), [&](Instruction *I) {
                        return isBoolLogicOp(I) &&
                               getRdxOperand(I, 0) == Res;
                      }))) {
            std::swap(VectorizedTree, Res);
          } else {
            VectorizedTree = Builder.CreateFreeze(VectorizedTree);
          }
        }

        return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
                        ReductionOps);
      }
      // Initialize the final value in the reduction.
      return Res;
    };
    SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
                                      ReductionOps.front().size());
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps) {
        if (!RdxOp)
          continue;
        IgnoreList.insert(RdxOp);
      }
    // Intersect the fast-math-flags from all reduction operations.
    FastMathFlags RdxFMF;
    RdxFMF.set();
    for (Value *U : IgnoreList)
      if (auto *FPMO = dyn_cast<FPMathOperator>(U))
        RdxFMF &= FPMO->getFastMathFlags();
    bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));

    // Need to track reduced vals, they may be changed during vectorization of
    // subvectors.
    for (ArrayRef<Value *> Candidates : ReducedVals)
      for (Value *V : Candidates)
        TrackedVals.try_emplace(V, V);

    auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
                 Value *V) -> unsigned & {
      auto *It = MV.find(V);
      assert(It != MV.end() && "Unable to find given key.");
      return It->second;
    };

    DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    WeakTrackingVH VectorizedTree = nullptr;
    bool CheckForReusedReductionOps = false;
    // Try to vectorize elements based on their type.
    SmallVector<InstructionsState> States;
    for (ArrayRef<Value *> RV : ReducedVals)
      States.push_back(getSameOpcode(RV, TLI));
    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
      ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
      InstructionsState S = States[I];
      SmallVector<Value *> Candidates;
      Candidates.reserve(2 * OrigReducedVals.size());
      DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
      for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
        Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
        // Check if the reduction value was not overriden by the
        // extractelement instruction because of the vectorization and
        // exclude it, if it is not compatible with other values.
        auto *Inst = dyn_cast<Instruction>(RdxVal);
        if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
             (!S || !S.isOpcodeOrAlt(Inst))) ||
            (S && !Inst))
          continue;
        Candidates.push_back(RdxVal);
        TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
      }
      bool ShuffledExtracts = false;
      // Try to handle shuffled extractelements.
      if (S && S.getOpcode() == Instruction::ExtractElement &&
          !S.isAltShuffle() && I + 1 < E) {
        SmallVector<Value *> CommonCandidates(Candidates);
        for (Value *RV : ReducedVals[I + 1]) {
          Value *RdxVal = TrackedVals.at(RV);
          // Check if the reduction value was not overriden by the
          // extractelement instruction because of the vectorization and
          // exclude it, if it is not compatible with other values.
          auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
          if (!Inst)
            continue;
          CommonCandidates.push_back(RdxVal);
          TrackedToOrig.try_emplace(RdxVal, RV);
        }
        SmallVector<int> Mask;
        if (isFixedVectorShuffle(CommonCandidates, Mask)) {
          ++I;
          Candidates.swap(CommonCandidates);
          ShuffledExtracts = true;
        }
      }

      // Emit code for constant values.
      if (Candidates.size() > 1 && allConstant(Candidates)) {
        Value *Res = Candidates.front();
        Value *OrigV = TrackedToOrig.at(Candidates.front());
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
        for (Value *VC : ArrayRef(Candidates).drop_front()) {
          Res = createOp(Builder, RdxKind, Res, VC, "const.rdx",
                         ReductionOps);
          Value *OrigV = TrackedToOrig.at(VC);
          ++VectorizedVals.try_emplace(OrigV).first->getSecond();
          if (auto *ResI = dyn_cast<Instruction>(Res))
            V.analyzedReductionRoot(ResI);
        }
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
        continue;
      }

      unsigned NumReducedVals = Candidates.size();
      if (NumReducedVals < ReductionLimit &&
          (NumReducedVals < 2 || !isSplat(Candidates)))
        continue;

      // Check if we support repeated scalar values processing (optimization
      // of original scalar identity operations on matched horizontal
      // reductions).
      IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
                                    RdxKind != RecurKind::FMul &&
                                    RdxKind != RecurKind::FMulAdd;
      // Gather same values.
      SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
      if (IsSupportedHorRdxIdentityOp)
        for (Value *V : Candidates) {
          Value *OrigV = TrackedToOrig.at(V);
          ++SameValuesCounter.try_emplace(OrigV).first->second;
        }
      // Used to check if the reduced values used the same number of times. In
      // this case the compiler may produce better code. E.g. if reduced
      // values are aabbccdd (8 x values), then the first node of the tree
      // will have a node for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2,
      // 2, 3, 3>.
      bool SameScaleFactor = false;
      bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
                              SameValuesCounter.size() != Candidates.size();
      if (OptReusedScalars) {
        SameScaleFactor =
            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
             RdxKind == RecurKind::Xor) &&
            all_of(drop_begin(SameValuesCounter),
                   [&SameValuesCounter](
                       const std::pair<Value *, unsigned> &P) {
                     return P.second == SameValuesCounter.front().second;
                   });
        Candidates.resize(SameValuesCounter.size());
        transform(SameValuesCounter, Candidates.begin(),
                  [&](const auto &P) { return TrackedVals.at(P.first); });
        NumReducedVals = Candidates.size();
        // Have a reduction of the same element.
        if (NumReducedVals == 1) {
          Value *OrigV = TrackedToOrig.at(Candidates.front());
          unsigned Cnt = At(SameValuesCounter, OrigV);
          Value *RedVal =
              emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
          VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
          VectorizedVals.try_emplace(OrigV, Cnt);
          ExternallyUsedValues.insert(OrigV);
          continue;
        }
      }

      unsigned MaxVecRegSize = V.getMaxVecRegSize();
      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
      const unsigned MaxElts = std::clamp<unsigned>(
          llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
          RegMaxNumber * RedValsMaxNumber);

      unsigned ReduxWidth = NumReducedVals;
      auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
        unsigned NumParts, NumRegs;
        Type *ScalarTy = Candidates.front()->getType();
        ReduxWidth =
            getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
        VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
        NumParts = TTI.getNumberOfParts(Tp);
        NumRegs =
            TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
        while (NumParts > NumRegs) {
          ReduxWidth = bit_floor(ReduxWidth - 1);
          VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
          NumParts = TTI.getNumberOfParts(Tp);
          NumRegs =
              TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
        }
        if (NumParts > NumRegs / 2)
          ReduxWidth = bit_floor(ReduxWidth);
        return ReduxWidth;
      };
      if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
        ReduxWidth = GetVectorFactor(ReduxWidth);
      ReduxWidth = std::min(ReduxWidth, MaxElts);

      unsigned Start = 0;
      unsigned Pos = Start;
      // Restarts vectorization attempt with lower vector factor.
      unsigned PrevReduxWidth = ReduxWidth;
      bool CheckForReusedReductionOpsLocal = false;
      auto AdjustReducedVals = [&](bool IgnoreVL = false) {
        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
        if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, worth
          // trying again with less number of reduction ops.
          CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
        }
        ++Pos;
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        Pos = Start;
        --ReduxWidth;
        if (ReduxWidth > 1)
          ReduxWidth = GetVectorFactor(ReduxWidth);
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
      SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // Dependency in tree of the reduction ops - drop this attempt, try
        // later.
        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
            Start == 0) {
          CheckForReusedReductionOps = true;
          break;
        }
        PrevReduxWidth = ReduxWidth;
        ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
        // Been analyzed already - skip.
        if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
            (!has_single_bit(ReduxWidth) &&
             (IgnoredCandidates.contains(
                  std::make_pair(Pos, bit_floor(ReduxWidth))) ||
              IgnoredCandidates.contains(
                  std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
                                 bit_floor(ReduxWidth))))) ||
            V.areAnalyzedReductionVals(VL)) {
          (void)AdjustReducedVals(/*IgnoreVL=*/true);
          continue;
        }
        // Early exit if any of the reduction values were deleted during
        // previous vectorization attempts.
        if (any_of(VL, [&V](Value *RedVal) {
              auto *RedValI = dyn_cast<Instruction>(RedVal);
              if (!RedValI)
                return false;
              return V.isDeleted(RedValI);
            }))
          break;
        V.buildTree(VL, IgnoreList);
        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        if (V.isLoadCombineReductionCandidate(RdxKind)) {
          if (!AdjustReducedVals())
            V.analyzedReductionVals(VL);
          continue;
        }
        V.reorderTopToBottom();
        // No need to reorder the root node at all.
        V.reorderBottomToTop(/*IgnoreReorder=*/true);
        // Keep extracted other reduction values, if they are used in the
        // vectorization trees.
        BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
            ExternallyUsedValues);
        // The reduction root is used as the insertion point for new
        // instructions, so set it as externally used to prevent it from being
        // deleted.
        LocalExternallyUsedValues.insert(ReductionRoot);
        for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
            continue;
          for (Value *V : ReducedVals[Cnt])
            if (isa<Instruction>(V))
              LocalExternallyUsedValues.insert(TrackedVals[V]);
        }
20130 if (isa<Instruction>(V))
20131 LocalExternallyUsedValues.insert(TrackedVals[V]);
20133 if (!IsSupportedHorRdxIdentityOp) {
20136 "Reused values counter map is not empty");
20137 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20138 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20140 Value *
V = Candidates[Cnt];
20141 Value *OrigV = TrackedToOrig.at(V);
20142 ++SameValuesCounter.
try_emplace(OrigV).first->second;
20145 V.transformNodes();
20149 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20150 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20152 Value *RdxVal = Candidates[Cnt];
20153 if (
auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
20154 RdxVal = It->second;
20155 if (!Visited.
insert(RdxVal).second)
20159 if (!VLScalars.contains(RdxVal) &&
V.isVectorized(RdxVal)) {
20160 LocalExternallyUsedValues.insert(RdxVal);
20163 Value *OrigV = TrackedToOrig.at(RdxVal);
20165 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
20166 if (NumOps != ReducedValsToOps.
at(OrigV).size())
20167 LocalExternallyUsedValues.insert(RdxVal);
20170 if (!IsSupportedHorRdxIdentityOp)
20171 SameValuesCounter.
clear();
20172 for (
Value *RdxVal : VL)
20173 if (RequiredExtract.
contains(RdxVal))
20174 LocalExternallyUsedValues.insert(RdxVal);
20175 V.buildExternalUses(LocalExternallyUsedValues);
20177 V.computeMinimumValueSizes();
20182 getReductionCost(
TTI, VL, IsCmpSelMinMax, RdxFMF, V);
20185 <<
" for reduction\n");
20189 V.getORE()->emit([&]() {
20191 ReducedValsToOps.
at(VL[0]).front())
20192 <<
"Vectorizing horizontal reduction is possible "
20193 <<
"but not beneficial with cost " <<
ore::NV(
"Cost",
Cost)
20194 <<
" and threshold "
20197 if (!AdjustReducedVals()) {
20198 V.analyzedReductionVals(VL);
20199 unsigned Offset = Pos == Start ? Pos : Pos - 1;
20200 if (ReduxWidth > ReductionLimit &&
V.isTreeNotExtendable()) {
20203 *
TTI, VL.front()->getType(), ReduxWidth - 1);
20204 VF >= ReductionLimit;
20206 *
TTI, VL.front()->getType(), VF - 1)) {
20208 V.getCanonicalGraphSize() !=
V.getTreeSize())
20210 for (
unsigned Idx : seq<unsigned>(ReduxWidth - VF))
20218 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorizing horizontal reduction at cost:"
20219 <<
Cost <<
". (HorRdx)\n");
20220 V.getORE()->emit([&]() {
20222 ReducedValsToOps.
at(VL[0]).front())
20223 <<
"Vectorized horizontal reduction with cost "
20224 <<
ore::NV(
"Cost",
Cost) <<
" and with tree size "
20225 <<
ore::NV(
"TreeSize",
V.getTreeSize());
20232 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
20234 if (IsCmpSelMinMax)
20235 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
20238 Value *VectorizedRoot =
20239 V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20242 for (
Value *RdxVal : Candidates) {
20243 Value *OrigVal = TrackedToOrig.at(RdxVal);
20244 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
20245 if (TransformedRdxVal != RdxVal)
20246 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
20255 VectorizedRoot = Builder.
CreateFreeze(VectorizedRoot);
20258 if (OptReusedScalars && !SameScaleFactor) {
20259 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
20260 SameValuesCounter, TrackedToOrig);
20263 Value *ReducedSubTree;
20264 Type *ScalarTy = VL.front()->getType();
20265 if (isa<FixedVectorType>(ScalarTy)) {
20270 for (
unsigned I : seq<unsigned>(ScalarTyNumElements)) {
20288 emitReduction(Lane, Builder,
TTI, RdxRootInst->
getType()),
I);
20291 ReducedSubTree = emitReduction(VectorizedRoot, Builder,
TTI,
20294 if (ReducedSubTree->
getType() != VL.front()->getType()) {
20295 assert(ReducedSubTree->
getType() != VL.front()->getType() &&
20296 "Expected different reduction type.");
20298 Builder.
CreateIntCast(ReducedSubTree, VL.front()->getType(),
20299 V.isSignedMinBitwidthRootNode());
20305 if (OptReusedScalars && SameScaleFactor)
20306 ReducedSubTree = emitScaleForReusedOps(
20307 ReducedSubTree, Builder, SameValuesCounter.
front().second);
20309 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
20311 for (
Value *RdxVal : VL) {
20312 Value *OrigV = TrackedToOrig.at(RdxVal);
20313 if (IsSupportedHorRdxIdentityOp) {
20314 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
20317 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20318 if (!
V.isVectorized(RdxVal))
20319 RequiredExtract.
insert(RdxVal);
20323 ReduxWidth = NumReducedVals - Pos;
20324 if (ReduxWidth > 1)
20325 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
20326 AnyVectorized =
true;
20328 if (OptReusedScalars && !AnyVectorized) {
20329 for (
const std::pair<Value *, unsigned> &
P : SameValuesCounter) {
20330 Value *RdxVal = TrackedVals.at(
P.first);
20331 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder,
P.second);
20332 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20333 VectorizedVals.try_emplace(
P.first,
P.second);
20338 if (VectorizedTree) {
20359 if (!AnyBoolLogicOp)
20361 if (isBoolLogicOp(RedOp1) && ((!InitStep &&
LHS == VectorizedTree) ||
20362 getRdxOperand(RedOp1, 0) ==
LHS ||
20365 if (isBoolLogicOp(RedOp2) && ((!InitStep &&
RHS == VectorizedTree) ||
20366 getRdxOperand(RedOp2, 0) ==
RHS ||
20371 if (
LHS != VectorizedTree)
20382 unsigned Sz = InstVals.
size();
20385 for (
unsigned I = 0,
E = (Sz / 2) * 2;
I <
E;
I += 2) {
20388 Value *RdxVal1 = InstVals[
I].second;
20389 Value *StableRdxVal1 = RdxVal1;
20390 auto It1 = TrackedVals.find(RdxVal1);
20391 if (It1 != TrackedVals.end())
20392 StableRdxVal1 = It1->second;
20393 Value *RdxVal2 = InstVals[
I + 1].second;
20394 Value *StableRdxVal2 = RdxVal2;
20395 auto It2 = TrackedVals.find(RdxVal2);
20396 if (It2 != TrackedVals.end())
20397 StableRdxVal2 = It2->second;
20401 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[
I].first,
20403 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
20404 StableRdxVal2,
"op.rdx", ReductionOps);
20405 ExtraReds[
I / 2] = std::make_pair(InstVals[
I].first, ExtraRed);
20408 ExtraReds[Sz / 2] = InstVals.
back();
20412 ExtraReductions.
emplace_back(cast<Instruction>(ReductionRoot),
20416 for (
Value *RdxVal : Candidates) {
20417 if (!Visited.
insert(RdxVal).second)
20419 unsigned NumOps = VectorizedVals.lookup(RdxVal);
20426 bool InitStep =
true;
20427 while (ExtraReductions.
size() > 1) {
20429 FinalGen(ExtraReductions, InitStep);
20430 ExtraReductions.
swap(NewReds);
20433 VectorizedTree = ExtraReductions.
front().second;
20435 ReductionRoot->replaceAllUsesWith(VectorizedTree);
20444 IgnoreSet.
insert(RdxOps.begin(), RdxOps.end());
20451 for (
auto *U :
Ignore->users()) {
20453 "All users must be either in the reduction ops list.");
20456 if (!
Ignore->use_empty()) {
20458 Ignore->replaceAllUsesWith(
P);
20461 V.removeInstructionsAndOperands(RdxOps);
20463 }
else if (!CheckForReusedReductionOps) {
20464 for (ReductionOpsType &RdxOps : ReductionOps)
20465 for (
Value *RdxOp : RdxOps)
20466 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
20468 return VectorizedTree;
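The tail of tryToReduce above pairs up the remaining scalar reduction values (ExtraReductions) and folds them two at a time until a single value is left, carrying any odd element over to the next round. A minimal standalone sketch of that pairwise folding loop, assuming an integer add reduction; reduceOp and foldPairwise are hypothetical names, not LLVM APIs:

#include <cstdio>
#include <vector>

// Hypothetical stand-in for createOp(): combine two partial results.
static int reduceOp(int A, int B) { return A + B; }

// Fold a list of partial reduction results pairwise, halving the list
// on every step, the way the ExtraReductions loop above does.
static int foldPairwise(std::vector<int> Vals) {
  while (Vals.size() > 1) {
    std::vector<int> Next;
    for (size_t I = 0, E = (Vals.size() / 2) * 2; I < E; I += 2)
      Next.push_back(reduceOp(Vals[I], Vals[I + 1]));
    if (Vals.size() % 2 != 0)   // the odd element survives to the next round
      Next.push_back(Vals.back());
    Vals.swap(Next);
  }
  return Vals.front();
}

int main() {
  std::printf("%d\n", foldPairwise({1, 2, 3, 4, 5})); // prints 15
}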
Type *ScalarTy = ReducedVals.front()->getType();
unsigned ReduxWidth = ReducedVals.size();
int Cnt = ReducedVals.size();
for (Value *RdxVal : ReducedVals) {
  Cost += GenCostFn();
  auto *RdxOp = cast<Instruction>(U);
  if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
    Cost += ScalarCost;
  Cost += GenCostFn();
case RecurKind::Add:
case RecurKind::Mul:
case RecurKind::Or:
case RecurKind::And:
case RecurKind::Xor:
case RecurKind::FAdd:
case RecurKind::FMul: {
  if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
    for (unsigned I : seq<unsigned>(ReducedVals.size())) {
  auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
      std::make_pair(RedTy, true));
  if (RType == RedTy) {
  ScalarCost = EvaluateScalarCost([&]() {
case RecurKind::FMax:
case RecurKind::FMin:
case RecurKind::FMaximum:
case RecurKind::FMinimum:
case RecurKind::SMax:
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin: {
  ScalarCost = EvaluateScalarCost([&]() {
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                  << " (It is a splitting reduction)\n");
return VectorCost - ScalarCost;
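getReductionCost returns the vector-minus-scalar cost delta, and the caller vectorizes only when that delta beats the threshold (compare the -slp-threshold option). A toy sketch of that decision rule, under the assumption that a delta below -Threshold means "profitable"; isProfitable is a hypothetical helper, not the pass's actual API:

#include <cstdio>

// Toy profitability check: vectorize only when the vector form is cheaper
// than the scalar form by more than the threshold.
static bool isProfitable(int VectorCost, int ScalarCost, int Threshold) {
  return VectorCost - ScalarCost < -Threshold;
}

int main() {
  std::printf("%d\n", isProfitable(/*VectorCost=*/5, /*ScalarCost=*/9,
                                   /*Threshold=*/0)); // 1: vectorize
}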
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(RdxKind != RecurKind::FMulAdd &&
       "A call to the llvm.fmuladd intrinsic is not handled yet");
auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
if (FTy->getScalarType() == Builder.getInt1Ty() &&
    RdxKind == RecurKind::Add &&
  VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
  ++NumVectorInstructions;
++NumVectorInstructions;
assert(IsSupportedHorRdxIdentityOp &&
       "The optimization of matched scalar identity horizontal reductions "
       "must be supported.");
  return VectorizedValue;
case RecurKind::Add: {
  Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
  << VectorizedValue << ". (HorRdx)\n");
  return Builder.CreateMul(VectorizedValue, Scale);
case RecurKind::Xor: {
  LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                    << ". (HorRdx)\n");
  return VectorizedValue;
case RecurKind::FAdd: {
  Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
  << VectorizedValue << ". (HorRdx)\n");
  return Builder.CreateFMul(VectorizedValue, Scale);
case RecurKind::And:
case RecurKind::Or:
case RecurKind::SMax:
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin:
case RecurKind::FMax:
case RecurKind::FMin:
case RecurKind::FMaximum:
case RecurKind::FMinimum:
  return VectorizedValue;
case RecurKind::Mul:
case RecurKind::FMul:
case RecurKind::FMulAdd:
case RecurKind::IAnyOf:
case RecurKind::FAnyOf:
case RecurKind::IFindLastIV:
case RecurKind::FFindLastIV:
case RecurKind::None:
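emitScaleForReusedOps leans on algebraic identities when one scalar occurs Cnt times in a reduction: Cnt copies under add collapse to a single multiply, xor copies cancel in pairs, and idempotent kinds (and/or/min/max) ignore repetition entirely. A standalone sketch of those identities; Kind and scaleReused are hypothetical names:

#include <cassert>
#include <cstdio>

enum class Kind { Add, Xor, And };

// Reduce Cnt identical copies of X without materializing them, using the
// same identities the switch above relies on.
static int scaleReused(Kind K, int X, unsigned Cnt) {
  assert(Cnt >= 1);
  switch (K) {
  case Kind::Add: return static_cast<int>(Cnt) * X; // x+x+...+x == Cnt*x
  case Kind::Xor: return (Cnt % 2) ? X : 0;         // pairs cancel out
  case Kind::And: return X;                         // idempotent
  }
  return X;
}

int main() {
  std::printf("%d %d %d\n", scaleReused(Kind::Add, 3, 4),
              scaleReused(Kind::Xor, 3, 4), scaleReused(Kind::And, 3, 4));
  // prints: 12 0 3
}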
assert(IsSupportedHorRdxIdentityOp &&
       "The optimization of matched scalar identity horizontal reductions "
       "must be supported.");
auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
if (VTy->getElementType() != VL.front()->getType()) {
  R.isSignedMinBitwidthRootNode());
case RecurKind::Add: {
  for (Value *V : VL) {
    unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
    Vals.push_back(ConstantInt::get(V->getType(), Cnt, false));
  << VectorizedValue << ". (HorRdx)\n");
  return Builder.CreateMul(VectorizedValue, Scale);
case RecurKind::And:
case RecurKind::Or:
  << ". (HorRdx)\n");
  return VectorizedValue;
case RecurKind::SMax:
case RecurKind::SMin:
case RecurKind::UMax:
case RecurKind::UMin:
case RecurKind::FMax:
case RecurKind::FMin:
case RecurKind::FMaximum:
case RecurKind::FMinimum:
  << ". (HorRdx)\n");
  return VectorizedValue;
case RecurKind::Xor: {
  cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
  std::iota(Mask.begin(), Mask.end(), 0);
  bool NeedShuffle = false;
  for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
    unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
    if (Cnt % 2 == 0) {
      NeedShuffle = true;
  dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
  ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
  return VectorizedValue;
case RecurKind::FAdd: {
  for (Value *V : VL) {
    unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
    Vals.push_back(ConstantFP::get(V->getType(), Cnt));
  return Builder.CreateFMul(VectorizedValue, Scale);
case RecurKind::Mul:
case RecurKind::FMul:
case RecurKind::FMulAdd:
case RecurKind::IAnyOf:
case RecurKind::FAnyOf:
case RecurKind::IFindLastIV:
case RecurKind::FFindLastIV:
case RecurKind::None:
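emitReusedOps applies the same identities lane by lane: in an add reduction every vector lane is multiplied by its own repetition count, while for xor the lanes whose count is even are zeroed through a shuffle with a null vector. A scalar sketch of that per-lane logic, using plain arrays in place of vectors; both helpers are hypothetical:

#include <cstdio>
#include <vector>

// Per-lane variant: Vals[I] occurred Counts[I] times in the original
// scalar reduction. For an add reduction, scale each lane; for xor,
// zero out lanes with an even count (those copies cancel).
static void scaleLanesAdd(std::vector<int> &Vals,
                          const std::vector<unsigned> &Counts) {
  for (size_t I = 0; I < Vals.size(); ++I)
    Vals[I] *= static_cast<int>(Counts[I]);
}

static void maskLanesXor(std::vector<int> &Vals,
                         const std::vector<unsigned> &Counts) {
  for (size_t I = 0; I < Vals.size(); ++I)
    if (Counts[I] % 2 == 0)
      Vals[I] = 0;
}

int main() {
  std::vector<int> A{1, 2, 3}, B{1, 2, 3};
  scaleLanesAdd(A, {2, 1, 3});
  maskLanesXor(B, {2, 1, 3});
  std::printf("%d %d %d | %d %d %d\n", A[0], A[1], A[2], B[0], B[1], B[2]);
  // prints: 2 2 9 | 0 2 3
}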
return HorizontalReduction::getRdxKind(V);

if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
  return cast<FixedVectorType>(IE->getType())->getNumElements();
unsigned AggregateSize = 1;
auto *IV = cast<InsertValueInst>(InsertInst);
Type *CurrentType = IV->getType();
if (auto *ST = dyn_cast<StructType>(CurrentType)) {
  for (auto *Elt : ST->elements())
    if (Elt != ST->getElementType(0))
      return std::nullopt;
  AggregateSize *= ST->getNumElements();
  CurrentType = ST->getElementType(0);
} else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
  AggregateSize *= AT->getNumElements();
  CurrentType = AT->getElementType();
} else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
  AggregateSize *= VT->getNumElements();
  return AggregateSize;
return AggregateSize;
return std::nullopt;
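getAggregateSize multiplies element counts while walking nested homogeneous structs, arrays, and fixed vectors, and gives up (std::nullopt) on heterogeneous structs. A toy version over a hand-rolled type tree; it keeps the multiplication walk but omits the heterogeneity bail-out, and all names are hypothetical:

#include <cstdio>
#include <optional>

// Toy type tree: a node is either a leaf (one element) or an aggregate of
// Count identical children. Mirrors the struct/array/vector walk above.
struct Ty {
  unsigned Count = 1;       // number of identical sub-elements
  const Ty *Elem = nullptr; // nullptr => leaf
};

static std::optional<unsigned> aggregateSize(const Ty *T) {
  unsigned Size = 1;
  while (T->Elem) { // multiply counts down to the leaf type
    Size *= T->Count;
    T = T->Elem;
  }
  return Size;
}

int main() {
  Ty Leaf;               // e.g. float
  Ty Arr4{4, &Leaf};     // [4 x float]
  Ty Struct2{2, &Arr4};  // { [4 x float], [4 x float] }
  std::printf("%u\n", *aggregateSize(&Struct2)); // prints 8
}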
unsigned OperandOffset, const BoUpSLP &R) {
  std::optional<unsigned> OperandIndex =
  if (!OperandIndex || R.isDeleted(LastInsertInst))
  if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
    BuildVectorOpds, InsertElts, *OperandIndex, R);
    BuildVectorOpds[*OperandIndex] = InsertedOperand;
    InsertElts[*OperandIndex] = LastInsertInst;
  LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
} while (LastInsertInst != nullptr &&
         isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&

assert((isa<InsertElementInst>(LastInsertInst) ||
        isa<InsertValueInst>(LastInsertInst)) &&
       "Expected insertelement or insertvalue instruction!");
       "Expected empty result vectors!");
if (!AggregateSize)
BuildVectorOpds.resize(*AggregateSize);
InsertElts.resize(*AggregateSize);
if (BuildVectorOpds.size() >= 2)
auto DominatedReduxValue = [&](Value *R) {
  return isa<Instruction>(R) &&
         DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
if (P->getIncomingBlock(0) == ParentBB) {
  Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
} else if (P->getIncomingBlock(1) == ParentBB) {
  Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
if (Rdx && DominatedReduxValue(Rdx))
if (P->getIncomingBlock(0) == BBLatch) {
  Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
} else if (P->getIncomingBlock(1) == BBLatch) {
  Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
if (Rdx && DominatedReduxValue(Rdx))
assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
        isa<IntrinsicInst>(Root)) &&
       "Expected binop, select, or intrinsic for reduction matching");
Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  return dyn_cast<Instruction>(RHS);
  return dyn_cast<Instruction>(LHS);

Value *Op0 = nullptr;
Value *Op1 = nullptr;
return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);

Value *B0 = nullptr, *B1 = nullptr;
bool SLPVectorizerPass::vectorizeHorReduction(
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
  if (Root->getParent() != BB || isa<PHINode>(Root))
  auto SelectRoot = [&]() {
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
    if (R.isAnalyzedReductionRoot(Inst))
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
  while (!Stack.empty()) {
    std::tie(Inst, Level) = Stack.front();
    if (R.isDeleted(Inst))
    if (Value *VectorizedV = TryToReduce(Inst)) {
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        Stack.emplace(I, Level);
      if (R.isDeleted(Inst))
    if (!TryAppendToPostponedInsts(Inst)) {
      if (VisitedInstrs.insert(Op).second)
        if (auto *I = dyn_cast<Instruction>(Op))
          if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
              !R.isDeleted(I) && I->getParent() == BB)
            Stack.emplace(I, Level);

bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
Res |= tryToVectorize(PostponedInsts, R);

for (Value *V : Insts)
  if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
    Res |= tryToVectorize(Inst, R);
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
  if (!R.canMapToVector(IVI->getType()))
  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
         "trying reduction first.";
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);

  (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      << "Cannot SLP vectorize list: only 2 elements of buildvector, "
         "trying reduction first.";
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
template <typename T>
  bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(*SameTypeIt, *IncIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      if (I && !R.isDeleted(I))
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      VL.swap(Candidates);
      Candidates.clear();
      if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
    auto GetMinNumElements = [&R](Value *V) {
      unsigned EltSize = R.getVectorElementSize(V);
      return std::max(2U, R.getMaxVecRegSize() / EltSize);
    if (NumElts < GetMinNumElements(*IncIt) &&
        (Candidates.empty() ||
         Candidates.front()->getType() == (*IncIt)->getType())) {
      if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, false)) {
      } else if (MaxVFOnly) {
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            if (I && !R.isDeleted(I))
          unsigned NumElts = VL.size();
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
      Candidates.clear();
    IncIt = SameTypeIt;
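tryToVectorizeSequence scans a pre-sorted worklist, grows a maximal run of mutually compatible entries, and hands every run of two or more to the vectorization callback. A stripped-down standalone sketch of that grouping loop with hypothetical names:

#include <cstdio>
#include <functional>
#include <vector>

// Group consecutive compatible items of a sorted worklist and hand each
// group of size >= 2 to TryVectorize, as the template above does.
template <typename T>
static bool vectorizeRuns(
    const std::vector<T> &Sorted,
    std::function<bool(const T &, const T &)> Compatible,
    std::function<bool(const std::vector<T> &)> TryVectorize) {
  bool Changed = false;
  size_t I = 0;
  while (I < Sorted.size()) {
    size_t J = I + 1; // extend the run while entries stay compatible
    while (J < Sorted.size() && Compatible(Sorted[I], Sorted[J]))
      ++J;
    if (J - I > 1)
      Changed |= TryVectorize({Sorted.begin() + I, Sorted.begin() + J});
    I = J;
  }
  return Changed;
}

int main() {
  std::vector<int> W{1, 1, 1, 2, 3, 3};
  bool C = vectorizeRuns<int>(
      W, [](int A, int B) { return A == B; },
      [](const std::vector<int> &G) {
        std::printf("group of %zu\n", G.size());
        return true;
      });
  std::printf("changed=%d\n", C); // groups of 3 and 2; changed=1
}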
template <bool IsCompatibility>
  "Expected valid element types only.");
  return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
      return !IsCompatibility;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
          return NodeI2 != nullptr;
        assert((NodeI1 == NodeI2) ==
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
        if (S && (IsCompatibility || !S.isAltShuffle()))
        if (IsCompatibility)
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
  return IsCompatibility;
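The IsCompatibility template parameter lets compareCmp serve both as a strict-weak-order comparator for sorting and as an equivalence test for grouping: on each mismatching key it either orders the two values or declares them incompatible. A minimal sketch of the pattern; Item and compareItems are hypothetical:

#include <cstdio>

struct Item { int TypeId; int Opcode; };

// One function, two behaviors: with IsCompatibility=true it answers
// "same bucket?"; with false it answers "does A sort before B?".
template <bool IsCompatibility>
static bool compareItems(const Item &A, const Item &B) {
  if (A.TypeId < B.TypeId)
    return !IsCompatibility; // differ: "less" for sorting, "no" for compat
  if (A.TypeId > B.TypeId)
    return false;
  if (A.Opcode != B.Opcode)
    return IsCompatibility ? false : A.Opcode < B.Opcode;
  return IsCompatibility;   // fully equal: compatible, but not "less"
}

int main() {
  Item X{1, 7}, Y{1, 9};
  std::printf("%d %d\n", compareItems<false>(X, Y), // 1: X sorts first
              compareItems<true>(X, Y));            // 0: not compatible
}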
template <typename ItT>
  bool Changed = false;
    if (R.isDeleted(I))
    if (auto *RootOp = dyn_cast<Instruction>(Op)) {
      Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
      if (R.isDeleted(I))
    if (R.isDeleted(I))
    Changed |= tryToVectorize(I, R);
    return compareCmp<false>(V, V2, *TLI, *DT);
  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
    return compareCmp<true>(V1, V2, *TLI, *DT);
  if (Vals.size() <= 1)
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          auto *Select = dyn_cast<SelectInst>(U);
          Select->getParent() != cast<Instruction>(V)->getParent();
        if (ArePossiblyReducedInOtherBlock)
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  for (auto *I : reverse(Instructions)) {
    if (R.isDeleted(I) || isa<CmpInst>(I))
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      vectorizeInsertValueInst(LastInsertValue, BB, R, true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      vectorizeInsertElementInst(LastInsertElem, BB, R, true);
    if (R.isDeleted(I))
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(I))
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      vectorizeInsertValueInst(LastInsertValue, BB, R, false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
  OpsChanged |= tryToVectorize(PostponedInsts, R);
bool Changed = false;
auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
  "Expected vectorizable types only.");
  V2->getType()->getScalarSizeInBits())
  V2->getType()->getScalarSizeInBits())
  if (Opcodes1.size() < Opcodes2.size())
  if (Opcodes1.size() > Opcodes2.size())
  for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
    auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
    auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
    return NodeI2 != nullptr;
    assert((NodeI1 == NodeI2) ==
           "Different nodes should have different DFS numbers");
    if (NodeI1 != NodeI2)
    if (S && !S.isAltShuffle())
    return I1->getOpcode() < I2->getOpcode();
    bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
    bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
    bool U1 = isa<UndefValue>(Opcodes1[I]);
    bool U2 = isa<UndefValue>(Opcodes2[I]);
    auto ValID1 = Opcodes1[I]->getValueID();
    auto ValID2 = Opcodes2[I]->getValueID();
    if (ValID1 == ValID2)
    if (ValID1 < ValID2)
    if (ValID1 > ValID2)
    assert(U1 && U2 && "The only thing left should be undef & undef.");
auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
  if (V1->getType() != V2->getType())
  if (Opcodes1.size() != Opcodes2.size())
  for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
    if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
    if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
      if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
        if (R.isDeleted(I1) || R.isDeleted(I2))
        if (I1->getParent() != I2->getParent())
    if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
    if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
bool HaveVectorizedPhiNodes = false;
  auto *P = dyn_cast<PHINode>(&I);
  if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
  if (!Opcodes.empty())
  while (!Nodes.empty()) {
    auto *PHI = cast<PHINode>(Nodes.pop_back_val());
    for (Value *V : PHI->incoming_values()) {
      if (auto *PHI1 = dyn_cast<PHINode>((V))) {
        Nodes.push_back(PHI1);
  HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
      Incoming, PHICompare, AreCompatiblePHIs,
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
  Changed |= HaveVectorizedPhiNodes;
  if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
        auto *PHI = dyn_cast<PHINode>(P.first);
        return !PHI || R.isDeleted(PHI);
    PHIToOpcodes.clear();
} while (HaveVectorizedPhiNodes);
VisitedInstrs.clear();
InstSetVector PostProcessInserts;
auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
  bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
  if (VectorizeCmps) {
    Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
    PostProcessCmps.clear();
  PostProcessInserts.clear();
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return PostProcessCmps.contains(Cmp);
  return isa<InsertElementInst, InsertValueInst>(I) &&
         PostProcessInserts.contains(I);
  return I->use_empty() &&
         (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  if (isa<ScalableVectorType>(It->getType()))
  if (R.isDeleted(&*It))
  if (!VisitedInstrs.insert(&*It).second) {
    if (HasNoUsers(&*It) &&
        VectorizeInsertsAndCmps(It->isTerminator())) {
  if (isa<DbgInfoIntrinsic>(It))
  if (PHINode *P = dyn_cast<PHINode>(It)) {
    if (P->getNumIncomingValues() == 2) {
      if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
    for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
      if (BB == P->getIncomingBlock(I) ||
      if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
          PI && !IsInPostProcessInstrs(PI)) {
        vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
        if (Res && R.isDeleted(P)) {
  if (HasNoUsers(&*It)) {
    bool OpsChanged = false;
    auto *SI = dyn_cast<StoreInst>(It);
    TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                          SI->getValueOperand()->hasOneUse();
    if (TryToVectorizeRoot) {
      for (auto *V : It->operand_values()) {
        if (auto *VI = dyn_cast<Instruction>(V);
            VI && !IsInPostProcessInstrs(VI))
          OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
    VectorizeInsertsAndCmps(It->isTerminator());
  if (isa<InsertElementInst, InsertValueInst>(It))
    PostProcessInserts.insert(&*It);
  else if (isa<CmpInst>(It))
    PostProcessCmps.insert(cast<CmpInst>(&*It));
auto Changed = false;
for (auto &Entry : GEPs) {
  if (Entry.second.size() < 2)
  LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                    << Entry.second.size() << ".\n");
  return !R.isDeleted(GEP);
  if (It == Entry.second.end())
  unsigned MaxVecRegSize = R.getMaxVecRegSize();
  unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
  if (MaxVecRegSize < EltSize)
  unsigned MaxElts = MaxVecRegSize / EltSize;
  for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
    auto Len = std::min<unsigned>(BE - BI, MaxElts);
    Candidates.remove_if([&R](Value *I) {
      return R.isDeleted(cast<Instruction>(I)) ||
             isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
    for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
      auto *GEPI = GEPList[I];
      if (!Candidates.count(GEPI))
      for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
        auto *GEPJ = GEPList[J];
        if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
          Candidates.remove(GEPI);
          Candidates.remove(GEPJ);
        } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
          Candidates.remove(GEPJ);
    if (Candidates.size() < 2)
    auto BundleIndex = 0u;
    for (auto *V : Candidates) {
      auto *GEP = cast<GetElementPtrInst>(V);
      auto *GEPIdx = GEP->idx_begin()->get();
      assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
      Bundle[BundleIndex++] = GEPIdx;
    Changed |= tryToVectorizeList(Bundle, R);
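The pruning loop above drops pairs of getelementptrs whose addresses differ by a compile-time constant (the SCEVConstant check), since such consecutive accesses are better served by the load/store paths. A toy analogue where an address is a symbolic base plus a constant offset, so "constant difference" becomes "same base"; the types and helper are hypothetical:

#include <cstdio>
#include <string>
#include <vector>

// Toy address: symbolic base plus constant offset. Two addresses differ
// by a constant exactly when their bases match -- a stand-in for the
// isa<SCEVConstant>(getMinusSCEV(...)) test above. Both sides of such a
// pair are dropped from the bundle of indices to vectorize.
struct Addr { std::string Base; long Off; };

static std::vector<Addr> pruneConstantDiffPairs(std::vector<Addr> Cands) {
  std::vector<bool> Dead(Cands.size(), false);
  for (size_t I = 0; I < Cands.size(); ++I)
    for (size_t J = I + 1; J < Cands.size(); ++J)
      if (!Dead[I] && !Dead[J] && Cands[I].Base == Cands[J].Base)
        Dead[I] = Dead[J] = true;
  std::vector<Addr> Out;
  for (size_t I = 0; I < Cands.size(); ++I)
    if (!Dead[I])
      Out.push_back(Cands[I]);
  return Out;
}

int main() {
  auto Out = pruneConstantDiffPairs({{"p", 0}, {"p", 4}, {"q", 0}});
  std::printf("%zu left\n", Out.size()); // 1 left: only the "q" address
}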
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
        return I1->getOpcode() < I2->getOpcode();
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
        isa<UndefValue>(V2->getValueOperand()))
    if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
      if (I1->getParent() != I2->getParent())
        isa<Constant>(V2->getValueOperand()))
      V2->getValueOperand()->getValueID();
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      << Pair.second.size() << ".\n");
    Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
          return vectorizeStores(Candidates, R, Attempted);
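StoreSorter orders stores by a cascade of keys (value type ID, pointer type ID, scalar width, then opcode/value IDs) so that compatible stores end up adjacent before the run-grouping pass. A compact sketch of such a lexicographic cascade via std::tie; StoreKey and storeLess are hypothetical:

#include <algorithm>
#include <cstdio>
#include <tuple>
#include <vector>

// Stand-in for a store's sort keys: value type id, pointer type id, and
// scalar width. Lexicographic tuple comparison mirrors the hand-written
// if/else cascade in StoreSorter above.
struct StoreKey { int ValueTypeId; int PtrTypeId; unsigned Bits; };

static bool storeLess(const StoreKey &A, const StoreKey &B) {
  return std::tie(A.ValueTypeId, A.PtrTypeId, A.Bits) <
         std::tie(B.ValueTypeId, B.PtrTypeId, B.Bits);
}

int main() {
  std::vector<StoreKey> S{{2, 1, 64}, {1, 1, 32}, {1, 1, 16}};
  std::sort(S.begin(), S.end(), storeLess);
  for (const StoreKey &K : S)
    std::printf("(%d,%d,%u) ", K.ValueTypeId, K.PtrTypeId, K.Bits);
  std::printf("\n"); // (1,1,16) (1,1,32) (2,1,64)
}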
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
void clearAllBits()
Set every bit to 0.
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
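A small sketch of the ArrayRef views above; the array contents are illustrative, and none of these calls copies an element:

#include "llvm/ADT/ArrayRef.h"
using namespace llvm;

int arrayRefDemo() {
  int Data[] = {1, 2, 3, 4, 5};
  ArrayRef<int> A(Data);                // non-owning view over Data
  ArrayRef<int> Mid = A.slice(1, 3);    // {2, 3, 4}
  ArrayRef<int> Tail = A.drop_front(2); // {3, 4, 5}
  return Mid.front() + Tail.back() + (int)A.size(); // 2 + 5 + 5 == 12
}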
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR changes in between queries.
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst); holds everything related to calling a function.
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on each one of them as this one.
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
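A brief sketch relating the predicate helpers above; the ICMP_SLT starting point is an arbitrary example:

#include "llvm/IR/Instructions.h"
using namespace llvm;

void predicateDemo(ICmpInst *Cmp) {
  // For X < Y (signed): swapping operands gives Y > X, negating gives X >= Y.
  CmpInst::Predicate P = Cmp->getPredicate();                   // e.g. ICMP_SLT
  CmpInst::Predicate Swapped = CmpInst::getSwappedPredicate(P); // ICMP_SGT
  CmpInst::Predicate Inverse = CmpInst::getInversePredicate(P); // ICMP_SGE
  (void)Swapped; (void)Inverse;
}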
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign information.
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate for the type of this constant.
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string, and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a multiple of 8.
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
static bool shouldExecute(unsigned CounterName)
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
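A minimal sketch of the DenseMap operations listed above (the key and value types are arbitrary):

#include "llvm/ADT/DenseMap.h"
using namespace llvm;

void denseMapDemo() {
  DenseMap<int, int> M;
  M.try_emplace(1, 10);                    // inserts {1, 10}
  auto [It, Inserted] = M.insert({1, 99}); // Inserted == false, key exists
  int V = M.lookup(2);                     // 0: default value for missing key
  if (M.contains(1))
    M.erase(1);
  (void)It; (void)Inserted; (void)V;
}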
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
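A hedged sketch of typical IRBuilder use, broadcasting a scalar into a vector much like the vectorizer's gather codegen; emitSplatMul, BB, X and Y are hypothetical names, and Y is assumed to already be a 4-element vector of X's type:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

Value *emitSplatMul(BasicBlock *BB, Value *X, Value *Y) {
  IRBuilder<> Builder(BB); // insert at the end of BB
  auto *VecTy = FixedVectorType::get(X->getType(), 4);
  Value *Vec = PoisonValue::get(VecTy);
  Vec = Builder.CreateInsertElement(Vec, X, Builder.getInt32(0));
  // Splat lane 0 across all four lanes with an all-zero shuffle mask.
  Vec = Builder.CreateShuffleVector(Vec, SmallVector<int>(4, 0));
  return Builder.CreateMul(Vec, Y, "splat.mul");
}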
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field or array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos lives in, right after MovePos.
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instruction comes before Other.
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this', or nullptr if no such instruction exists.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
A discriminated union of two or more pointer types, with the discriminator in the low bit of the pointer.
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience functions.
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its value, or which may disconnect it from a def-use chain linking it to a loop.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exactly one source vector.
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e. each index between [0..VF) is used exactly once in each submask of size VF.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index, Factor + Index, ..., Factor * (NumElts - 1) + Index>.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
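A short sketch of the mask classification helpers above on hand-written masks (the masks themselves are illustrative):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

void maskDemo() {
  int Index;
  SmallVector<int> Extract = {2, 3}; // high half of a 4-element source
  bool IsExtract = ShuffleVectorInst::isExtractSubvectorMask(
      Extract, /*NumSrcElts=*/4, Index); // true, Index == 2
  SmallVector<int> Rev = {3, 2, 1, 0};   // reverses one 4-element source
  bool IsRev = ShuffleVectorInst::isReverseMask(Rev, /*NumSrcElts=*/4); // true
  (void)IsExtract; (void)IsRev; (void)Index;
}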
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
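A minimal SmallBitVector sketch tying the queries above together (the size and positions are arbitrary):

#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;

void bitVectorDemo() {
  SmallBitVector BV(8); // 8 bits, all clear
  BV.set(1);
  BV.set(4);
  int First = BV.find_first();          // 1
  int Next = BV.find_next(First);       // 4
  bool Partial = BV.any() && !BV.all(); // true: some, not all, bits set
  unsigned NumSet = BV.count();         // 2
  (void)Next; (void)Partial; (void)NumSet;
}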
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all small sizes.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+ etc).
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
void takeName(Value *V)
Transfer the name from V to this value.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, Instruction *VL0, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state in which we can represent the loads of the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node down to the leaf nodes.
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers a non-vectorizable sequence of loads.
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given load sequence is known to be non-vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns its signedness.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target-specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars to be extracted, their lanes and their scalar users.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions, marking for deletion trivially dead operands.
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and extraction in the UserIgnoreLst.
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty order.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibly) permutation with other gathers.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
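Taken together, the BoUpSLP entry points above are driven in a fairly fixed sequence. A simplified sketch of that sequence under stated assumptions (vectorizeRootsSketch and Threshold are hypothetical; the pass's real tryToVectorizeList adds VF search, scheduling bail-outs and profitability heuristics around these calls):

static bool vectorizeRootsSketch(BoUpSLP &R, ArrayRef<Value *> Roots,
                                 InstructionCost Threshold) {
  SmallDenseSet<Value *> UserIgnoreLst; // nothing ignored in this sketch
  R.buildTree(Roots, UserIgnoreLst);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.transformNodes();
  R.buildExternalUses();
  R.computeMinimumValueSizes();
  InstructionCost Cost = R.getTreeCost();
  if (!Cost.isValid() || Cost >= Threshold)
    return false; // not profitable enough
  R.vectorizeTree();
  return true;
}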
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
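A short sketch composing the matchers above; matchShlOr and the matched shape are illustrative:

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Recognize `(X << C) | Y` and capture the pieces.
bool matchShlOr(Value *V, Value *&X, Value *&Y, const APInt *&C) {
  return match(V, m_Or(m_Shl(m_Value(X), m_APInt(C)), m_Value(Y)));
}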
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to properly calculate it.
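A hedged sketch of the usual consecutive-access check built on getPointersDiff; areConsecutiveLoads is a hypothetical helper, and the returned distance is assumed to be in units of the element type:

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/Instructions.h"
#include <optional>
using namespace llvm;

static bool areConsecutiveLoads(LoadInst *A, LoadInst *B, const DataLayout &DL,
                                ScalarEvolution &SE) {
  std::optional<int> Diff =
      getPointersDiff(A->getType(), A->getPointerOperand(), B->getType(),
                      B->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  return Diff && *Diff == 1; // B loads the element right after A
}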
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item in the sequence, and B, C, ... are the values from the original input ranges.
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing the effect of MI in a DIExpression.
testing::Matcher< const detail::ErrorHolder & > Failed()
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal.address from the specified value, returning the original object being addressed.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
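A minimal sketch of the integer math helpers scattered through this list (the values are arbitrary):

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
using namespace llvm;

void mathDemo() {
  unsigned L = Log2_32(32);            // 5 (floor of log2)
  uint64_t Up = PowerOf2Ceil(20);      // 32
  unsigned Ceil = llvm::bit_ceil(20u); // 32: smallest power of two >= 20
  uint64_t Aligned = alignDown(37, 8); // 32: largest multiple of 8 <= 37
  (void)L; (void)Up; (void)Ceil; (void)Aligned;
}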
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type, i.e. adding an extra element results in extra parts upon type legalization.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices, if reordering is required.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that will be converted into a vector (I).
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly.
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors (including the next instruction that follows within a basic block).
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
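A small sketch combining hash_combine with hash_combine_range; the key and values are illustrative:

#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

hash_code hashDemo() {
  SmallVector<int, 4> Vals = {1, 2, 3, 4};
  // Fold a scalar key and the contents of a range into one hash_code.
  return hash_combine(42, hash_combine_range(Vals.begin(), Vals.end()));
}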
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than C's type.
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through regular use operands.
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx, or on the return type if OpdIdx is -1.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits methods.
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKind::Vector with EC lanes.
Function object to check whether the first component of a container supported by std::get (like std::pair and std::tuple) compares less than the first component of another container.
Function object to check whether the second component of a container supported by std::get (like std::pair and std::tuple) compares less than the second component of another container.
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.