#ifdef EXPENSIVE_CHECKS
using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
STATISTIC(NumVectorInstructions,
          "Number of vector instructions generated");
114 "Controls which SLP graphs should be vectorized.");
118 cl::desc(
"Run the SLP vectorization passes"));
122 cl::desc(
"Enable vectorization for wider vector utilization"));
126 cl::desc(
"Only vectorize if you gain more than this "
131 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
132 "heuristics and makes vectorization decision via cost modeling."));
136 cl::desc(
"Attempt to vectorize horizontal reductions"));
141 "Attempt to vectorize horizontal reductions feeding into a store"));
145 cl::desc(
"Attempt to vectorize for this register size in bits"));
149 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
157 cl::desc(
"Limit the size of the SLP scheduling region per block"));
161 cl::desc(
"Attempt to vectorize for this register size in bits"));
165 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
169 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
175 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
184 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
188 cl::desc(
"The minimum number of loads, which should be considered strided, "
189 "if the stride is > 1 or is runtime value"));
193 cl::desc(
"The maximum stride, considered to be profitable."));
197 cl::desc(
"Display the SLP trees with Graphviz"));
201 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
static bool isValidElementType(Type *Ty) {
  // TODO: Support ScalableVectorType.
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}
/// \returns the type of the given value \p V: the stored value's type for
/// stores, the compared operand's type for compares, the inserted scalar's
/// type for insertelement.
static Type *getValueType(Value *V) {
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();
  return V->getType();
}
/// \returns the number of elements for \p Ty; 1 for scalar types.
static unsigned getNumElements(Type *Ty) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
  return 1;
}
/// Returns the smallest element count not less than \p Sz that still
/// legalizes into whole vector registers for element type \p Ty.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
                                              Type *Ty, unsigned Sz) {
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_ceil(Sz);
  return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
}

/// Returns the largest element count not greater than \p Sz that still
/// legalizes into whole vector registers of width RegVF.
static unsigned getFloorFullVectorNumberOfElements(
    const TargetTransformInfo &TTI, Type *Ty, unsigned Sz) {
  unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_floor(Sz);
  unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
  if (RegVF > Sz)
    return bit_floor(Sz);
  return (Sz / RegVF) * RegVF;
}
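// For example (assuming the floor variant above): Sz = 7 scalars with a
// register width RegVF = 4 yields (7 / 4) * 4 = 4 elements; the remaining
// three scalars do not fill a whole register.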
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                        : Mask[I] * VecTyNumElements + J;
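// Illustrative expansion (per the scalar-to-vector widening above): with
// VecTyNumElements = 2, the scalar mask {1, 0} becomes {2, 3, 0, 1} -- each
// scalar index I expands to {I*2, I*2+1}.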
/// \returns the number of extract-subvector groups in \p VL: all values must
/// be shufflevectors from the same source, with extract-subvector masks that
/// together cover all elements of the source.
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
  if (VL.empty())
    return 0;
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
    ArrayRef<Value *> Group = VL.slice(I, GroupSize);
    SmallBitVector ExpectedIndex(GroupSize);
    if (!all_of(Group, [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(V);
          // All shufflevectors must come from the same source.
          if (SV->getOperand(0) != Src)
            return false;
          int Index;
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);
          return true;
        }))
      return 0;
    if (!ExpectedIndex.all())
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) &&
         "Unexpected number of groups");
  return NumGroup;
}
/// \returns a shufflevector mask which is used to vectorize shufflevectors.
static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
  assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  SmallVector<int> Mask;
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
                                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
  return Mask;
}
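// Illustrative result (per the accumulation above): for two shufflevectors
// over <4 x ...> sources with masks {0,1} and {2,3}, the combined mask is
// {0,1,6,7} -- the second group is offset by SVNumElements = 4.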
/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}

/// Checks if \p V is one of vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
}
452 OS <<
"Idx: " <<
Idx <<
", ";
453 OS <<
"n=" << VL.
size() <<
" [" << *VL.
front() <<
", ..]";
/// \returns true if all of the instructions in \p VL are in the same block.
static bool allSameBlock(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return false;
  BasicBlock *BB = cast<Instruction>(*It)->getParent();
  for (Value *V : iterator_range(It, VL.end())) {
    if (isa<PoisonValue>(V))
      continue;
    auto *II = dyn_cast<Instruction>(V);
    if (!II)
      return false;
    if (BB != II->getParent())
      return false;
  }
  return true;
}
/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}
/// \returns True if \p I is commutative; handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(
                BO->uses(),
                [](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  CmpInst::Predicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(BO->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}
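// Illustrative consequence of the checks above: a 'sub' whose every use is
// 'icmp eq/ne (sub x, y), 0' or a matching 'abs' call counts as commutative,
// since x - y == 0 iff y - x == 0; an 'fsub' used only by 'fabs' likewise.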
template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                     unsigned Offset) {
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  int Index = Offset;
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }
  return std::nullopt;
}

/// \returns the inserting or extracting index of an InsertElement,
/// ExtractElement or InsertValue instruction, using \p Offset as the base.
/// \returns std::nullopt if the index is not an immediate.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;

  int Index = Offset;
  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  if (!IV)
    return std::nullopt;

  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
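// Worked example for the InsertValue path above: for
//   insertvalue {[2 x i32], i32} %agg, i32 %v, 0, 1
// the outer aggregate multiplies the index by its element count and adds the
// field number at each step: (0*2 + 0)*2 + 1 = 1, the flattened index.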
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
635template <
bool IsPoisonOnly = false>
639 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
642 auto *VecTy = dyn_cast<FixedVectorType>(
V->getType());
645 auto *
C = dyn_cast<Constant>(V);
647 if (!UseMask.empty()) {
649 while (
auto *
II = dyn_cast<InsertElementInst>(
Base)) {
651 if (isa<T>(
II->getOperand(1)))
658 if (*
Idx < UseMask.size() && !UseMask.test(*
Idx))
666 Res &= isUndefVector<IsPoisonOnly>(
Base, SubMask);
673 for (
unsigned I = 0, E = VecTy->getNumElements();
I != E; ++
I) {
674 if (
Constant *Elem =
C->getAggregateElement(
I))
676 (UseMask.empty() || (
I < UseMask.size() && !UseMask.test(
I))))
/// Checks if the vector of instructions can be represented as a shuffle of one
/// or two source vectors, filling \p Mask with the extract indices.
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  unsigned Size =
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        if (!EI)
          return S;
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        if (!VTy)
          return S;
        return std::max(S, VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  bool HasNonUndefVec = any_of(VL, [](Value *V) {
    auto *EE = dyn_cast<ExtractElementInst>(V);
    if (!EE)
      return false;
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))
      return false;
    return true;
  });
  (void)HasNonUndefVec;
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extract from an undef or poison vector.
    if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
      continue;
    if (isa<UndefValue>(Vec)) {
      Mask[I] = I;
    } else {
      if (isa<UndefValue>(EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
      if (!Idx)
        return std::nullopt;
      // Undefined behavior if Idx is negative or >= Size.
      if (Idx->getValue().uge(Size))
        continue;
      unsigned IntIdx = Idx->getValue().getZExtValue();
      Mask[I] = IntIdx;
    }
    // For correct shuffling we have to have at most 2 different vector
    // operands in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index differs from the lane number, it is a permutation.
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it a blend.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}
/// \returns the index of an extractelement or extractvalue instruction.
static std::optional<unsigned> getExtractIndex(const Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}
/// Main data required for vectorization of instructions.
class InstructionsState {
  /// The main/alternate instruction. MainOp is also VL0.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

public:
  Instruction *getMainOp() const { return MainOp; }
  Instruction *getAltOp() const { return AltOp; }

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return MainOp ? MainOp->getOpcode() : 0; }
  unsigned getAltOpcode() const { return AltOp ? AltOp->getOpcode() : 0; }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return AltOp != MainOp; }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp)
      : MainOp(MainOp), AltOp(AltOp) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
};
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (isConstant(BaseOp0) && isConstant(Op0)) ||
         (isConstant(BaseOp1) && isConstant(Op1)) ||
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1;
}

/// \returns true if a compare instruction \p CI has a similar "look" and the
/// same predicate as \p BaseCI, "as is" or "swapped".
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);
  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}
/// \returns analysis of the instructions in \p VL described in
/// InstructionsState: the opcode (and alternate opcode) under which the whole
/// list could be vectorized even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI) {
  // Make sure these are all Instructions.
  if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
    return InstructionsState::invalid();

  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return InstructionsState::invalid();

  Value *V = *It;
  unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
  if ((VL.size() > 2 && !isa<PHINode>(V) && InstCnt < VL.size() / 2) ||
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();

  bool IsCastOp = isa<CastInst>(V);
  bool IsBinOp = isa<BinaryOperator>(V);
  bool IsCmpOp = isa<CmpInst>(V);
  CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(V)->getPredicate()
                                        : CmpInst::BAD_ICMP_PREDICATE;
  unsigned Opcode = cast<Instruction>(V)->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = std::distance(VL.begin(), It);

  bool SwappedPredsCompatible = [&]() {
    if (!IsCmpOp)
      return false;
    SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      if (!I)
        return false;
      CmpInst::Predicate CurrentPred = I->getPredicate();
      CmpInst::Predicate SwappedCurrentPred =
          CmpInst::getSwappedPredicate(CurrentPred);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    // More than 2 predicates total, but only 2 when swapped predicates are
    // considered compatible: treat them as compatible, not alternate.
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();

  // Check for one alternate opcode from another BinaryOperator.
  auto *IBase = cast<Instruction>(V);
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState::invalid();
  }
  bool AnyPoison = InstCnt != VL.size();
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = dyn_cast<Instruction>(VL[Cnt]);
    if (!I)
      continue;

    // Cannot combine poison and divisions.
    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltIndex = Cnt;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltIndex = Cnt;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(V);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // Check for compatible operands. If the corresponding operands are
        // not compatible - need to perform alternate vectorization.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if ((E == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltIndex = Cnt;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "
             "CastInst.");
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState::invalid();
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState::invalid();
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            (!CallBase->hasOperandBundles() ||
             !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                         Call->op_begin() + Call->getBundleOperandsEndIndex(),
                         CallBase->op_begin() +
                             CallBase->getBundleOperandsStartIndex())))
          return InstructionsState::invalid();
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState::invalid();
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState::invalid();
        }
      }
      continue;
    }
    return InstructionsState::invalid();
  }

  return InstructionsState(cast<Instruction>(V),
                           cast<Instruction>(VL[AltIndex]));
}
1098 unsigned Opcode = UserInst->
getOpcode();
1100 case Instruction::Load: {
1101 LoadInst *LI = cast<LoadInst>(UserInst);
1104 case Instruction::Store: {
1105 StoreInst *SI = cast<StoreInst>(UserInst);
1106 return (SI->getPointerOperand() == Scalar);
1108 case Instruction::Call: {
1109 CallInst *CI = cast<CallInst>(UserInst);
1112 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1113 Arg.value().get() == Scalar;
/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}
/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // Check if input scalars were extended to match the size of other node.
       (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
      "SubMask with many inputs support must be larger than the mask.");
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}
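// Worked composition (per NewMask[I] = Mask[SubMask[I]] above): with
// Mask = {1, 2, 0} and SubMask = {2, 0, 1}, the result is {0, 1, 2}.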
/// Order may have elements assigned a special value (the order size), which is
/// out of bounds. Such indices correspond to undef values and are fixed up by
/// assigning them the indices left unused by the rest of the order.
static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
  const unsigned Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
}
1207 Type *ScalarTy = VL[0]->getType();
1210 for (
unsigned Lane : seq<unsigned>(VL.
size())) {
1211 if (isa<PoisonValue>(VL[Lane]))
1213 if (cast<Instruction>(VL[Lane])->
getOpcode() == Opcode1)
1214 OpcodeMask.
set(Lane * ScalarTyNumElements,
1215 Lane * ScalarTyNumElements + ScalarTyNumElements);
static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E, PoisonMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}
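// For instance, Indices = {2, 0, 1} inverts to Mask = {1, 2, 0}: element I
// of the reordered vector comes from position Indices[I] of the original.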
1234 assert(!Mask.empty() &&
"Expected non-empty mask.");
1238 for (
unsigned I = 0, E = Prev.
size();
I < E; ++
I)
1240 Scalars[Mask[
I]] = Prev[
I];
/// Checks if the provided value does not require scheduling: it is not an
/// instruction, or all its operands are phis/not instructions/from other
/// blocks.
static bool areAllOperandsNonInsts(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  return !mayHaveNonDefUseDependency(*I) &&
         all_of(I->operands(), [I](Value *V) {
           auto *IO = dyn_cast<Instruction>(V);
           if (!IO)
             return true;
           return isa<PHINode>(IO) || IO->getParent() != I->getParent();
         });
}

/// Checks if the provided value does not require scheduling: all its users are
/// phis or live in other blocks.
static bool isUsedOutsideBlock(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;
  // Limits the number of uses to save compile time.
  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
         all_of(I->users(), [I](User *U) {
           auto *IU = dyn_cast<Instruction>(U);
           if (!IU)
             return true;
           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
         });
}

static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
  return !VL.empty() &&
         (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
}

/// Returns true if a widened type of \p Ty with \p Sz elements represents a
/// full vector type, i.e. adding an extra element results in extra parts upon
/// type legalization.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
                                     unsigned Sz) {
  if (!isValidElementType(Ty))
    return false;
  if (has_single_bit(Sz))
    return true;
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
         Sz % NumParts == 0;
}
namespace slpvectorizer {

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
  struct TreeEntry;
  struct ScheduleData;
  // ...

public:
  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE) {
    // ...
  }

  // ...

  /// Returns whether the root node has in-graph users.
  bool doesRootHaveInTreeUses() const {
    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();
  }

  /// Return the scalars of the root node.
  ArrayRef<Value *> getRootNodeScalars() const {
    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;
  }
1404 const TreeEntry &Root = *VectorizableTree.
front().get();
1405 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1406 !Root.Scalars.front()->getType()->isIntegerTy())
1407 return std::nullopt;
1408 auto It = MinBWs.
find(&Root);
1409 if (It != MinBWs.
end())
1413 if (Root.getOpcode() == Instruction::ZExt ||
1414 Root.getOpcode() == Instruction::SExt)
1415 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
1416 Root.getOpcode() == Instruction::SExt);
1417 return std::nullopt;
1423 return MinBWs.
at(VectorizableTree.
front().get()).second;
1428 if (ReductionBitWidth == 0 ||
1429 !VectorizableTree.
front()->Scalars.front()->getType()->isIntegerTy() ||
1430 ReductionBitWidth >=
1431 DL->getTypeSizeInBits(
1432 VectorizableTree.
front()->Scalars.front()->getType()))
1434 VectorizableTree.
front()->Scalars.front()->getType(),
1435 VectorizableTree.
front()->getVectorFactor());
1438 VectorizableTree.
front()->Scalars.front()->getContext(),
1440 VectorizableTree.
front()->getVectorFactor());
  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MultiNodeScalars.clear();
    MustGather.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
    ReductionBitWidth = 0;
    BaseGraphSize = 1;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
  }
  /// Check if the order is an identity order (an element either keeps its
  /// position or carries the out-of-bounds "undef" marker Sz).
  static bool isIdentityOrder(ArrayRef<unsigned> Order) {
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();
    return all_of(enumerate(Order), [&](const auto &P) {
      return P.value() == P.index() || P.value() == Sz;
    });
  }
  unsigned getMaxVecRegSize() const { return MaxVecRegSize; }

  unsigned getMinVecRegSize() const { return MinVecRegSize; }

  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
    unsigned MaxVF = MaxVFOption.getNumOccurrences()
                         ? MaxVFOption
                         : TTI->getMaximumVF(ElemWidth, Opcode);
    return MaxVF ? MaxVF : UINT_MAX;
  }

  // ...

  /// Checks if the given array of loads can be represented as a vectorized,
  /// scatter or just simple gather.
  LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                               SmallVectorImpl<unsigned> &Order,
                               SmallVectorImpl<Value *> &PointerOps,
                               unsigned *BestVF = nullptr,
                               bool TryRecursiveCheck = true) const;
1626 template <
typename T>
1653 OS <<
"{User:" << (
UserTE ? std::to_string(
UserTE->Idx) :
"null")
1654 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
1676 : TLI(TLI),
DL(
DL), SE(SE), R(R), NumLanes(NumLanes),
1677 MaxLevel(MaxLevel) {}
1731 if (isa<LoadInst>(V1)) {
1733 auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
1738 auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
1740 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1743 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1746 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1748 ((
int)V1->getNumUses() == NumLanes ||
1749 AllUsersAreInternal(V1, V2)))
1755 auto CheckSameEntryOrFail = [&]() {
1756 if (
const TreeEntry *TE1 = R.getTreeEntry(V1);
1757 TE1 && TE1 == R.getTreeEntry(V2))
1762 auto *LI1 = dyn_cast<LoadInst>(V1);
1763 auto *LI2 = dyn_cast<LoadInst>(V2);
1765 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1767 return CheckSameEntryOrFail();
1770 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1771 LI2->getPointerOperand(),
DL, SE,
true);
1772 if (!Dist || *Dist == 0) {
1775 R.TTI->isLegalMaskedGather(
1778 return CheckSameEntryOrFail();
1782 if (std::abs(*Dist) > NumLanes / 2)
1791 auto *C1 = dyn_cast<Constant>(V1);
1792 auto *C2 = dyn_cast<Constant>(V2);
1806 if (isa<UndefValue>(V2))
1810 Value *EV2 =
nullptr;
1823 int Dist = Idx2 - Idx1;
1826 if (std::abs(Dist) == 0)
1828 if (std::abs(Dist) > NumLanes / 2)
1835 return CheckSameEntryOrFail();
1838 auto *I1 = dyn_cast<Instruction>(V1);
1839 auto *I2 = dyn_cast<Instruction>(V2);
1841 if (I1->getParent() != I2->getParent())
1842 return CheckSameEntryOrFail();
1849 if (S.getOpcode() &&
1850 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.
empty() ||
1851 !S.isAltShuffle()) &&
1853 return isa<PoisonValue>(V) ||
1854 cast<Instruction>(V)->getNumOperands() ==
1855 S.getMainOp()->getNumOperands();
1861 if (I1 && isa<PoisonValue>(V2))
1864 if (isa<UndefValue>(V2))
1867 return CheckSameEntryOrFail();
    /// Go through the operands of \p LHS and \p RHS recursively until
    /// MaxLevel, and return the cumulative score.
    int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
                           Instruction *U2, int CurrLevel,
                           ArrayRef<Value *> MainAltOps) const {
      // Get the shallow score of LHS and RHS.
      int ShallowScoreAtThisLevel =
          getShallowScore(LHS, RHS, U1, U2, MainAltOps);

      // If we reached MaxLevel, or the values are not instructions, or they
      // are identical, or further recursion is not profitable, early-return.
      auto *I1 = dyn_cast<Instruction>(LHS);
      auto *I2 = dyn_cast<Instruction>(RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      // Contains the I2 operand indices that got matched with I1 operands.
      SmallSet<unsigned, 4> Op2Used;

      // Recurse towards the operands of I1 and I2, trying all possible operand
      // pairs and keeping track of the best score.
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        // Try to pair OpIdx1 with the best operand of I2.
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
        // If I2 is commutative, try all combinations.
        unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
        unsigned ToIdx = isCommutative(I2)
                             ? I2->getNumOperands()
                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          // Skip operands already paired with OpIdx1.
          if (Op2Used.count(OpIdx2))
            continue;
          // Recursively calculate the cost at each level.
          int TmpScore =
              getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
                                 I1, I2, CurrLevel + 1, {});
          // Look for the best score.
          if (TmpScore > LookAheadHeuristics::ScoreFail &&
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
            MaxOpIdx2 = OpIdx2;
            FoundBest = true;
          }
        }
        if (FoundBest) {
          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
          Op2Used.insert(MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
        }
      }
      return ShallowScoreAtThisLevel;
    }
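    // Illustrative recursion (per the loops above): for (a0 + b0) and
    // (b1 + a1) with commutative '+', level 1 pairs a0 with a1 and b0 with
    // b1, so consecutive loads among {a0,a1} and {b0,b1} still boost the
    // score despite the swapped operand order.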
    /// For each operand we need (i) the value, and (ii) the opcode that it
    /// would be attached to if the expression was in a left-linearized form.
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      /// The operand value.
      Value *V = nullptr;
      /// The "Alternate Predicate Opcode" flag in left-linearized form.
      bool APO = false;
      /// Helper data for the reordering function.
      bool IsUsed = false;
    };

    /// During operand reordering, we select the operand in a lane that matches
    /// best with the operand in the neighboring lane, using one of these modes.
    enum class ReorderingMode {
      Load,     ///< Matching loads to consecutive memory addresses
      Opcode,   ///< Matching instructions based on opcode (same or alternate)
      Constant, ///< Matching constants
      Splat,    ///< Matching the same instruction multiple times (broadcast)
      Failed,   ///< We failed to create a vectorizable group
    };

    using OperandDataVec = SmallVector<OperandData, 2>;
    /// A vector of operand vectors.
    SmallVector<OperandDataVec, 4> OpsVec;
    unsigned ArgSize = 0;
    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    const Loop *L = nullptr;
2036 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
2037 return OpsVec[OpIdx][Lane];
2041 const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const {
2042 return OpsVec[OpIdx][Lane];
2047 for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
2048 OpIdx != NumOperands; ++OpIdx)
2049 for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2051 OpsVec[OpIdx][Lane].IsUsed =
false;
2055 void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
2056 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    /// \returns a bonus score if selecting \p Idx for \p OpIdx at \p Lane
    /// improves the chance of building a splat among the lanes.
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
                      const SmallBitVector &UsedLanes) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
          isa<ExtractElementInst>(IdxLaneV))
        return 0;
      SmallDenseMap<Value *, unsigned, 4> Uniques;
      for (unsigned Ln : seq<unsigned>(getNumLanes())) {
        if (Ln == Lane)
          continue;
        Value *OpIdxLnV = getData(OpIdx, Ln).V;
        if (!isa<Instruction>(OpIdxLnV))
          return 0;
        Uniques.try_emplace(OpIdxLnV, Ln);
      }
      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      auto OpIdxIt = Uniques.find(OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        return 0;
      return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                          UniquesCntWithOpIdxLaneV,
                      UniquesCntWithOpIdxLaneV -
                          bit_floor(UniquesCntWithOpIdxLaneV)) -
             ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                  ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                  : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    }
    /// \returns a bonus if the users of the candidate are already vectorized.
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      // Vector-like instructions (extracts with constant indices) are already
      // externally used; vectorizing them adds no extra extractelement.
      if (isVectorLikeInstWithConstOps(IdxLaneV) &&
          isVectorLikeInstWithConstOps(OpIdxLaneV))
        return LookAheadHeuristics::ScoreAllUserVectorized;
      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
        return 0;
      return R.areAllUsersVectorized(IdxLaneI)
                 ? LookAheadHeuristics::ScoreAllUserVectorized
                 : 0;
    }

    /// Score scaling factor for fully compatible instructions but with
    /// different number of external uses.
    static const int ScoreScaleFactor = 10;

    /// \returns the look-ahead score, which tells us how much the sub-trees
    /// rooted at \p LHS and \p RHS match.
    int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
                          int Lane, unsigned OpIdx, unsigned Idx, bool MustMatch,
                          const SmallBitVector &UsedLanes) {
      LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
                                    LookAheadMaxDepth);
      // ...
      int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
      if (Score <= -SplatScore) {
        Score = 0;
      } else {
        Score += SplatScore;
        // Scale the score so that a difference in external-use counts can
        // break ties without affecting the main selection.
        Score *= ScoreScaleFactor;
        Score += getExternalUseScore(Lane, OpIdx, Idx);
      }
      return Score;
    }
2178 std::optional<unsigned>
2179 getBestOperand(
unsigned OpIdx,
int Lane,
int LastLane,
2183 unsigned NumOperands = getNumOperands();
2186 Value *OpLastLane = getData(OpIdx, LastLane).V;
2189 ReorderingMode RMode = ReorderingModes[OpIdx];
2190 if (RMode == ReorderingMode::Failed)
2191 return std::nullopt;
2194 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2200 std::optional<unsigned>
Idx;
2204 BestScoresPerLanes.
try_emplace(std::make_pair(OpIdx, Lane), 0)
2210 bool IsUsed = RMode == ReorderingMode::Splat ||
2211 RMode == ReorderingMode::Constant ||
2212 RMode == ReorderingMode::Load;
2214 for (
unsigned Idx = 0;
Idx != NumOperands; ++
Idx) {
2216 OperandData &OpData = getData(
Idx, Lane);
2218 bool OpAPO = OpData.APO;
2227 if (OpAPO != OpIdxAPO)
2232 case ReorderingMode::Load:
2233 case ReorderingMode::Opcode: {
2234 bool LeftToRight = Lane > LastLane;
2235 Value *OpLeft = (LeftToRight) ? OpLastLane :
Op;
2236 Value *OpRight = (LeftToRight) ?
Op : OpLastLane;
2237 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2238 OpIdx,
Idx, IsUsed, UsedLanes);
2239 if (Score >
static_cast<int>(BestOp.Score) ||
2240 (Score > 0 && Score ==
static_cast<int>(BestOp.Score) &&
2243 BestOp.Score = Score;
2244 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2248 case ReorderingMode::Constant:
2249 if (isa<Constant>(
Op) ||
2250 (!BestOp.Score && L && L->isLoopInvariant(
Op))) {
2252 if (isa<Constant>(
Op)) {
2254 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2257 if (isa<UndefValue>(
Op) || !isa<Constant>(
Op))
2261 case ReorderingMode::Splat:
2262 if (
Op == OpLastLane || (!BestOp.Score && isa<Constant>(
Op))) {
2263 IsUsed =
Op == OpLastLane;
2264 if (
Op == OpLastLane) {
2266 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2272 case ReorderingMode::Failed:
2278 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2282 return std::nullopt;
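    // Illustrative pick (per the switch above): in Opcode mode, with the
    // previous lane's operand being a load of %p[0], a candidate load of
    // %p[1] in the current lane wins via the look-ahead consecutive-loads
    // score, while an unrelated value scores lower or fails.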
    /// Returns the best lane to start the greedy reordering from: the lane
    /// whose operands can move the least.
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      // std::pair<unsigned, unsigned> implements simple voting: the first
      // unsigned counts votes, the second is the lane.
      MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
      // If two lanes have the same cost, prefer the one with the highest
      // index, to stay closer to the original order.
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        // Compare the number of operands that can move and choose the one with
        // the least number.
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap.clear();
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          // Select the most optimal lane in terms of number of operands that
          // should be moved around.
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto [It, Inserted] =
              HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
          if (!Inserted)
            ++It->second.first;
        }
      }
      // Select the lane with the minimum counter.
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
      for (const auto &Data : reverse(HashMap)) {
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
      }
      return BestLane;
    }
    /// Data structure that helps to reorder operands.
    struct OperandsOrderData {
      /// The best number of operands with the same APOs, which can be
      /// reordered.
      unsigned NumOfAPOs = UINT_MAX;
      /// Number of operands with the same/alternate instruction opcode and
      /// parent.
      unsigned NumOpsWithSameOpcodeParent = 0;
      /// Hash of the actual operands ordering.
      unsigned Hash = 0;
    };

    /// \returns the maximum number of operands that are allowed to be
    /// reordered for \p Lane and the number of compatible instructions (with
    /// the same parent/opcode).
    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
      Instruction *OpcodeI = nullptr;
      BasicBlock *Parent = nullptr;
      unsigned Hash = 0;
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        const OperandData &OpData = getData(OpIdx, Lane);
        if (OpData.APO)
          ++CntTrue;
        // Use Boyer-Moore majority voting to find the majority opcode and the
        // number of times it occurs.
        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
          if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
              I->getParent() != Parent) {
            if (NumOpsWithSameOpcodeParent == 0) {
              NumOpsWithSameOpcodeParent = 1;
              OpcodeI = I;
              Parent = I->getParent();
            } else {
              --NumOpsWithSameOpcodeParent;
            }
          } else {
            ++NumOpsWithSameOpcodeParent;
          }
        }
        Hash = hash_combine(
            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
      }
      if (AllUndefs)
        return {};
      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      Data.Hash = Hash;
      return Data;
    }
    /// Go through the instructions in VL and append their operands.
    void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
      assert(!VL.empty() && "Bad VL");
      assert((empty() || VL.size() == getNumLanes()) &&
             "Expected same number of lanes");
      // IntrinsicInst::isCommutative returns true if swapping the first "two"
      // arguments to the intrinsic produces the same result.
      constexpr unsigned IntrinsicNumOperands = 2;
      unsigned NumOperands = VL0->getNumOperands();
      ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
      OpsVec.resize(NumOperands);
      unsigned NumLanes = VL.size();
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        OpsVec[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
                 "Expected instruction or poison value");
          if (isa<PoisonValue>(VL[Lane])) {
            OpsVec[OpIdx][Lane] = {
                PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
                false};
            continue;
          }
          bool IsInverseOperation =
              !isCommutative(cast<Instruction>(VL[Lane]));
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
                                 APO, false};
        }
      }
    }
    /// \returns the number of operands.
    unsigned getNumOperands() const { return ArgSize; }

    /// \returns the number of lanes.
    unsigned getNumLanes() const { return OpsVec[0].size(); }

    /// \returns the operand value at \p OpIdx and \p Lane.
    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    }

    /// \returns true if the data structure is empty.
    bool empty() const { return OpsVec.empty(); }

    /// Clears the data.
    void clear() { OpsVec.clear(); }
    /// \returns true if there are enough operands identical to \p Op to fill
    /// the whole vector (it is mixed with constants or loop invariant values).
    /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      // Small number of loads - try load matching.
      if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
        return false;
      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(Op);
      unsigned Cnt = 0;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        // This is set to true if we found a candidate for broadcast at Lane.
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)
            continue;
          Value *OpILane = getValue(OpI, Lane);
          bool IsConstantOp = isa<Constant>(OpILane);
          // Consider the broadcast candidate if:
          // 1. The same value is found in one of the operands, or
          // 2. The operand in the given lane is not constant but a constant
          //    operand in another lane can be moved here, or
          // 3. The operand in the current lane is loop invariant and can be
          //    moved to any other lane.
          if (Data.V == Op ||
              (!IsConstantOp &&
               ((Lns > 2 && isa<Constant>(Data.V)) ||
                (Lns == 2 &&
                 isa<Constant>(Data.V)))) ||
              (IsInvariant && !isa<Constant>(Data.V) &&
               L->isLoopInvariant(Data.V))) {
            FoundCandidate = true;
            Data.IsUsed = Data.V == Op;
            if (Data.V == Op)
              ++Cnt;
            break;
          }
        }
        if (!FoundCandidate)
          return false;
      }
      return getNumLanes() == 2 || Cnt > 1;
    }
    /// Checks if there is at least one compatible operand in lanes other
    /// than \p Lane, compatible with the operand \p Op.
    bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      bool OpAPO = getData(OpIdx, Lane).APO;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
              const OperandData &Data = getData(OpI, Ln);
              if (Data.APO != OpAPO || Data.IsUsed)
                return false;
              Value *OpILn = getValue(OpI, Ln);
              return (L && L->isLoopInvariant(OpILn)) ||
                     (getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
                      allSameBlock({Op, OpILn}));
            }))
          return true;
      }
      return false;
    }
  public:
    /// Initialize with all the operands of the instruction vector \p RootVL.
    VLOperands(ArrayRef<Value *> RootVL, Instruction *VL0, const BoUpSLP &R)
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
          L(R.LI->getLoopFor((VL0->getParent()))) {
      // Append all the operands of RootVL.
      appendOperandsOfVL(RootVL, VL0);
    }

    /// \returns a value vector with the operands across all lanes for the
    /// operand at \p OpIdx.
    ValueList getVL(unsigned OpIdx) const {
      ValueList OpVL(OpsVec[OpIdx].size());
      assert(OpsVec[OpIdx].size() == getNumLanes() &&
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;
      return OpVL;
    }
    // Performs operand reordering for 2 or more operands.
    void reorder() {
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      // Each operand has its own mode, used to select the instructions for
      // each lane such that they match best with the ones selected so far.
      SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

      // This is a greedy single-pass algorithm. We visit each lane once and
      // decide on the best order right away with no back-tracking. To increase
      // its effectiveness, we start with the lane that has operands that can
      // move the least.
      unsigned FirstLane = getBestLaneToStartReordering();

      // Initialize the modes.
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);
        // Keep track of whether we have instructions with all the same opcode
        // on one side.
        if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
          // Check if OpLane0 should be broadcast.
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
              !canBeVectorized(OpILane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else if (isa<LoadInst>(OpILane0))
            ReorderingModes[OpIdx] = ReorderingMode::Load;
          else
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(OpLane0)) {
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        } else if (isa<Argument>(OpLane0)) {
          // Our best hope is a Splat. It may save some cost in some cases.
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        } else {
          llvm_unreachable("Unexpected value kind.");
        }
      }

      // No need to reorder if the operands are just a perfect diamond or a
      // shuffled diamond match.
      auto &&SkipReordering = [this]() {
        SmallPtrSet<Value *, 4> UniqueValues;
        ArrayRef<OperandData> Op0 = OpsVec.front();
        for (const OperandData &Data : Op0)
          UniqueValues.insert(Data.V);
        for (ArrayRef<OperandData> Op :
             ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
          if (any_of(Op, [&UniqueValues](const OperandData &Data) {
                return !UniqueValues.contains(Data.V);
              }))
            return false;
        }
        // ...
        return true;
      };

      if (SkipReordering())
        return;

      bool StrategyFailed = false;
      // Mark all operand data as free to use.
      clearUsed();
      // We keep the original operand order for FirstLane and reorder the rest
      // of the lanes, visiting lanes in a circular fashion around FirstLane
      // with increasing radius.
      SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
      for (unsigned I = 0; I < NumOperands; ++I)
        MainAltOps[I].push_back(getData(I, FirstLane).V);

      SmallBitVector UsedLanes(NumLanes);
      UsedLanes.set(FirstLane);
      for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
        // Visit the lane on the right and then the lane on the left.
        for (int Direction : {+1, -1}) {
          int Lane = FirstLane + Direction * Distance;
          if (Lane < 0 || Lane >= (int)NumLanes)
            continue;
          UsedLanes.set(Lane);
          int LastLane = Lane - Direction;
          assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                 "Out of bounds");
          // Look for a good match for each operand.
          for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
            // Search for the operand that matches SortedOps[OpIdx][Lane-1].
            std::optional<unsigned> BestIdx =
                getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                               MainAltOps[OpIdx], UsedLanes);
            if (BestIdx) {
              // Swap the current operand with the one returned by
              // getBestOperand().
              swap(OpIdx, *BestIdx, Lane);
            } else {
              // Enable the second pass.
              StrategyFailed = true;
            }
            // Try to get the alternate opcode and follow it during analysis.
            if (MainAltOps[OpIdx].size() != 2) {
              OperandData &AltOp = getData(OpIdx, Lane);
              InstructionsState OpS =
                  getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
              if (OpS.getOpcode() && OpS.isAltShuffle())
                MainAltOps[OpIdx].push_back(AltOp.V);
            }
          }
        }
      }
      // If the strategy failed for any operand index, run a second pass.
      if (!StrategyFailed)
        // ...
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    static StringRef getModeStr(ReorderingMode RMode) {
      switch (RMode) {
      case ReorderingMode::Load:
        return "Load";
      case ReorderingMode::Opcode:
        return "Opcode";
      case ReorderingMode::Constant:
        return "Constant";
      case ReorderingMode::Splat:
        return "Splat";
      case ReorderingMode::Failed:
        return "Failed";
      }
      llvm_unreachable("Unimplemented Reordering Type");
    }

    /// Debug print.
    LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
      const unsigned Indent = 2;
      unsigned Cnt = 0;
      for (const OperandDataVec &OpDataVec : OpsVec) {
        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {
          OS.indent(Indent) << "{";
          if (Value *V = OpData.V)
            OS << *V;
          else
            OS << "null";
          OS << ", APO:" << OpData.APO << "}\n";
        }
        OS << "\n";
      }
      return OS;
    }
#endif
  };
  /// Evaluate each pair in \p Candidates and return the index of the pair
  /// with the highest score, i.e. the best chance to form a profitable root.
  std::optional<int>
  findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
                   int Limit = LookAheadHeuristics::ScoreFail) const {
    LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
                                  RootLookAheadMaxDepth);
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
      int Score = LookAhead.getScoreAtLevelRec(
          Candidates[I].first, Candidates[I].second, /*U1=*/nullptr,
          /*U2=*/nullptr, /*CurrLevel=*/1, {});
      if (Score > BestScore) {
        BestScore = Score;
        Index = I;
      }
    }
    return Index;
  }
  /// Removes an instruction from its block and eventually deletes it. The
  /// actual deletion is delayed until BoUpSLP is destructed.
  void eraseInstruction(Instruction *I) { DeletedInstructions.insert(I); }
2803 template <
typename T>
2806 for (
T *V : DeadVals) {
2807 auto *
I = cast<Instruction>(V);
2808 DeletedInstructions.insert(
I);
2811 for (
T *V : DeadVals) {
2812 if (!V || !Processed.
insert(V).second)
2814 auto *
I = cast<Instruction>(V);
2817 if (
const TreeEntry *Entry = getTreeEntry(
I)) {
2818 Entries.push_back(Entry);
2819 auto It = MultiNodeScalars.find(
I);
2820 if (It != MultiNodeScalars.end())
2821 Entries.append(It->second.begin(), It->second.end());
2823 for (
Use &U :
I->operands()) {
2824 if (
auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2825 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2827 (Entries.empty() ||
none_of(Entries, [&](
const TreeEntry *Entry) {
2828 return Entry->VectorizedValue == OpI;
2832 I->dropAllReferences();
2834 for (
T *V : DeadVals) {
2835 auto *
I = cast<Instruction>(V);
2836 if (!
I->getParent())
2841 cast<Instruction>(U.getUser()));
2843 "trying to erase instruction with users.");
2844 I->removeFromParent();
2848 while (!DeadInsts.
empty()) {
2851 if (!VI || !VI->getParent())
2854 "Live instruction found in dead worklist!");
2855 assert(VI->use_empty() &&
"Instructions with uses are not dead.");
2862 for (
Use &OpU : VI->operands()) {
2863 Value *OpV = OpU.get();
2874 if (
auto *OpI = dyn_cast<Instruction>(OpV))
2875 if (!DeletedInstructions.contains(OpI) &&
2880 VI->removeFromParent();
2881 DeletedInstructions.insert(VI);
  /// Checks if the instruction was already analyzed as a possible reduction
  /// root.
  bool isAnalyzedReductionRoot(Instruction *I) const {
    return AnalyzedReductionsRoots.count(I);
  }
  /// Register the given instruction as an analyzed reduction root.
  void analyzedReductionRoot(Instruction *I) {
    AnalyzedReductionsRoots.insert(I);
  }
  /// Clear the list of analyzed reduction root instructions.
  void clearReductionData() {
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
  }
  /// Checks if the given value was not scheduled.
  bool isNotScheduled(const Value *V) const {
    return NonScheduledFirst.contains(V);
  }
private:
  /// Checks if the given node can be demoted to a smaller bit width.
  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      // ... (worklist/visited parameters elided) ...
      bool &IsProfitableToDemote, bool IsTruncRoot) const;

  /// Check if the operands on the edges \p Edges of the \p UserTE allow
  /// reordering (i.e. the operands can be reordered because they have only
  /// one user and are reorderable).
  bool canReorderOperands(TreeEntry *UserTE,
                          SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
                          ArrayRef<TreeEntry *> ReorderableGathers,
                          SmallVectorImpl<TreeEntry *> &GatherOps);

  /// Reorders the node, which does not require reordering of the operands,
  /// with the given reuses mask.
  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
  /// if any. If it is not vectorized (gather node), returns nullptr.
  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
    TreeEntry *TE = nullptr;
    const auto *It = find_if(VL, [&](Value *V) {
      TE = getTreeEntry(V);
      if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
        return true;
      auto It = MultiNodeScalars.find(V);
      if (It != MultiNodeScalars.end()) {
        for (TreeEntry *E : It->second) {
          if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
            TE = E;
            return true;
          }
        }
      }
      return false;
    });
    if (It != VL.end()) {
      assert(TE->isSame(VL) && "Expected same scalars.");
      return TE;
    }
    return nullptr;
  }

  /// Const version of \see getVectorizedOperand().
  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);
  }

  /// Checks if all users of \p I are part of the vectorization tree.
  bool areAllUsersVectorized(
      Instruction *I,
      const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;

  /// \returns the graph entry for the \p Idx operand of the \p E entry.
  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;

  /// Gets the root instruction for the given node.
  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;

  /// \returns the cast context for the given graph node.
  TargetTransformInfo::CastContextHint
  getCastContextHint(const TreeEntry &TE) const;

  /// Recursively build the vectorizable tree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
                     const EdgeInfo &EI, unsigned InterleaveFactor = 0);

  /// Checks whether the extracts can be reused directly.
  bool canReuseExtract(ArrayRef<Value *> VL,
                       SmallVectorImpl<unsigned> &CurrentOrder,
                       bool ResizeAllowed = false) const;

  /// Returns the vectorized operand node that matches the order of scalars.
  TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
  const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
                                               unsigned NodeIdx) const {
    return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
  }

  /// Vectorize the \p NodeIdx-th operand of the entry \p E.
  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);

  /// Create a new vector from a list of scalar values.
  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);

  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy,
                           bool PostponedPHIs);

  /// \returns the instruction in the bundle which can be used as a base point
  /// for scheduling; usually the last instruction in the bundle.
  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  /// Tries to gather extractelement instructions with constant indices from a
  /// fixed vector type into a shuffle of the source vector(s).
  std::optional<TargetTransformInfo::ShuffleKind>
  tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
                                           SmallVectorImpl<int> &Mask) const;

  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
  tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                             SmallVectorImpl<int> &Mask,
                             unsigned NumParts) const;

  /// Checks if the gathered \p VL can be represented as a single-register
  /// shuffle of previous tree entries.
  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(
      const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
      SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
      bool ForOrder);

  /// Checks if the gathered \p VL can be represented as multi-register
  /// shuffle(s) of previous tree entries.
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
  isGatherShuffledEntry(
      const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
      SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
      unsigned NumParts, bool ForOrder = false);

  /// \returns the cost of gathering (inserting) the values in \p VL into a
  /// vector.
  InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                Type *ScalarTy) const;

  /// Set the Builder insert point to one after the last instruction in the
  /// bundle.
  void setInsertPointAfterBundle(const TreeEntry *E);

  /// \returns whether the VectorizableTree is fully vectorizable and will be
  /// beneficial even when the tree height is tiny.
  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  /// Run through the list of all gathered loads in the graph and try to find
  /// vector loads/masked gathers instead of regular gathers.
  void tryToVectorizeGatheredLoads(/* ... */);

  /// Helper for findExternalStoreUsersReorderIndices(): iterates over the
  /// users of \p TE and collects the stores.
  SmallVector<SmallVector<StoreInst *>>
  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  /// Iterates through the users of \p TE, looking for scalar stores that can
  /// potentially be vectorized in a future SLP tree.
  SmallVector<OrdersType, 1>
  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;

  /// Tries to reorder the gather node for better vectorization opportunities.
  void reorderGatherNode(TreeEntry &TE);
3179 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3196 [Scalars](
Value *V,
int Idx) {
3197 return (isa<UndefValue>(V) &&
3198 Idx == PoisonMaskElem) ||
3199 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3202 if (!ReorderIndices.empty()) {
3209 return IsSame(Scalars, Mask);
3210 if (VL.
size() == ReuseShuffleIndices.size()) {
3212 return IsSame(Scalars, Mask);
3216 return IsSame(Scalars, ReuseShuffleIndices);
3219 bool isOperandGatherNode(
const EdgeInfo &UserEI)
const {
3220 return isGather() && !UserTreeIndices.empty() &&
3221 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3222 UserTreeIndices.front().UserTE == UserEI.UserTE;
3226 bool hasEqualOperands(
const TreeEntry &TE)
const {
3227 if (
TE.getNumOperands() != getNumOperands())
3230 for (
unsigned I = 0, E = getNumOperands();
I < E; ++
I) {
3231 unsigned PrevCount =
Used.count();
3232 for (
unsigned K = 0;
K < E; ++
K) {
3235 if (getOperand(K) ==
TE.getOperand(
I)) {
3241 if (PrevCount ==
Used.count())
3250 unsigned getVectorFactor()
const {
3251 if (!ReuseShuffleIndices.empty())
3252 return ReuseShuffleIndices.size();
3253 return Scalars.
size();
3257 bool isGather()
const {
return State == NeedToGather; }
    /// Combined opcodes for nodes that are matched as a single combined
    /// operation (e.g. min/max) during the graph transformations.
    enum CombinedOpcode {
      NotCombinedOp = -1,
      MinMax = Instruction::OtherOpsEnd + 1,
    };
    CombinedOpcode CombinedOp = NotCombinedOp;

    /// Points back to the VectorizableTree.
    VecTreeTy &Container;

    /// Interleaving factor for interleaved loads Vectorize nodes.
    unsigned InterleaveFactor = 0;

  public:
    /// Returns the interleave factor for interleave nodes.
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    /// Sets the interleaving factor for the interleaving nodes.
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }

    /// Set this bundle's \p OpIdx'th operand to \p OpVL.
    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
      if (Operands.size() < OpIdx + 1)
        Operands.resize(OpIdx + 1);
      assert(Operands[OpIdx].empty() && "Already resized?");
      assert(OpVL.size() <= Scalars.size() &&
             "Number of operands is greater than the number of scalars.");
      Operands[OpIdx].resize(OpVL.size());
      copy(OpVL, Operands[OpIdx].begin());
    }
    /// Set this bundle's operands from \p Scalars.
    void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
      VLOperands Ops(Scalars, MainOp, R);
      if (RequireReorder)
        Ops.reorder();
      for (unsigned I : seq<unsigned>(MainOp->getNumOperands()))
        setOperand(I, Ops.getVL(I));
    }

    // ...

    /// \returns the number of operands.
    unsigned getNumOperands() const { return Operands.size(); }
    /// \returns the \p OpIdx operand of this TreeEntry.
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      assert(!Operands[OpIdx].empty() && "No operand available");
      return Operands[OpIdx][0];
    }

    /// Some of the instructions in the list have alternate opcodes.
    bool isAltShuffle() const { return MainOp != AltOp; }

    bool isOpcodeOrAlt(Instruction *I) const {
      unsigned CheckedOpcode = I->getOpcode();
      return (getOpcode() == CheckedOpcode ||
              getAltOpcode() == CheckedOpcode);
    }

    /// Chooses the correct key for scheduling data. If \p Op has the same (or
    /// alternate) opcode as \p MainOp, the key is \p Op; otherwise it is
    /// MainOp.
    Value *isOneOf(Value *Op) const {
      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))
        return Op;
      return MainOp;
    }

    void setOperations(const InstructionsState &S) {
      MainOp = S.getMainOp();
      AltOp = S.getAltOp();
    }

    /// The main/alternate opcodes for the list of instructions.
    unsigned getOpcode() const {
      return MainOp ? MainOp->getOpcode() : 0;
    }

    unsigned getAltOpcode() const {
      return AltOp ? AltOp->getOpcode() : 0;
    }
    /// When ReuseShuffleIndices is empty it just returns the position of \p V
    /// within the vector of Scalars. Otherwise, tries to remap via the reuse
    /// indices.
    int findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        if (*It != V)
          continue;
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
          break;
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
          break;
        }
      }
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
      return FoundLane;
    }
    /// Check if the entry has a non-power-of-2 number of scalars.
    bool isNonPowOf2Vec() const {
      bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
      return IsNonPowerOf2;
    }

    /// Check if the entry has a non-power-of-2 or non-whole-register number of
    /// scalars.
    bool hasNonWholeRegisterOrNonPowerOf2Vec(
        const TargetTransformInfo &TTI) const {
      bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
          TTI, getValueType(Scalars.front()), Scalars.size());
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    }

    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      inversePermutation(ReorderIndices, Mask);
      return Scalars[Mask[Idx]];
    }
#ifndef NDEBUG
    /// Debug printer.
    LLVM_DUMP_METHOD void dump() const {
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          dbgs().indent(2) << *V << "\n";
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        dbgs().indent(2) << *V << "\n";
      dbgs() << "State: ";
      switch (State) {
      case Vectorize:
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
                 << "\n";
        } else {
          dbgs() << "Vectorize\n";
        }
        break;
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        break;
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        break;
      case NeedToGather:
        dbgs() << "NeedToGather\n";
        break;
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
        break;
      }
      dbgs() << "MainOp: ";
      if (MainOp)
        dbgs() << *MainOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "AltOp: ";
      if (AltOp)
        dbgs() << *AltOp << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "\n";
      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";
      dbgs() << "\n";
    }
#endif
  };
#ifndef NDEBUG
  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
                     InstructionCost VecCost, InstructionCost ScalarCost,
                     StringRef Banner) const {
    dbgs() << "SLP: " << Banner << ":\n";
    E->dump();
    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP:     ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP:     VectorCost = " << VecCost << "\n";
    dbgs() << "SLP:     ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP:     ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
  }
#endif
3568 std::optional<ScheduleData *> Bundle,
3569 const InstructionsState &S,
3570 const EdgeInfo &UserTreeIdx,
3573 unsigned InterleaveFactor = 0) {
3574 TreeEntry::EntryState EntryState =
3575 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3576 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3577 ReuseShuffleIndices, ReorderIndices);
3578 if (E && InterleaveFactor > 0)
3579 E->setInterleave(InterleaveFactor);
3584 TreeEntry::EntryState EntryState,
3585 std::optional<ScheduleData *> Bundle,
3586 const InstructionsState &S,
3587 const EdgeInfo &UserTreeIdx,
3590 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3591 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3592 "Need to vectorize gather entry?");
3594 if (GatheredLoadsEntriesFirst.has_value() &&
3595 EntryState == TreeEntry::NeedToGather &&
3596 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3597 !UserTreeIdx.UserTE)
3599 VectorizableTree.
push_back(std::make_unique<TreeEntry>(VectorizableTree));
3600 TreeEntry *
Last = VectorizableTree.
back().get();
3601 Last->Idx = VectorizableTree.
size() - 1;
3602 Last->State = EntryState;
3607 ReuseShuffleIndices.empty()) &&
3608 "Reshuffling scalars not yet supported for nodes with padding");
3609 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3610 ReuseShuffleIndices.end());
3611 if (ReorderIndices.
empty()) {
3613 Last->setOperations(S);
3616 Last->Scalars.assign(VL.
size(),
nullptr);
3619 if (Idx >= VL.size())
3620 return UndefValue::get(VL.front()->getType());
3624 Last->setOperations(S);
3625 Last->ReorderIndices.append(ReorderIndices.
begin(), ReorderIndices.
end());
3627 if (!
Last->isGather()) {
3628 for (
Value *V : VL) {
3629 const TreeEntry *
TE = getTreeEntry(V);
3631 "Scalar already in tree!");
3634 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(
Last);
3637 ScalarToTreeEntry[
V] =
Last;
3640 ScheduleData *BundleMember = *Bundle;
3641 assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
3644 "Bundle and VL out of sync");
3646 for (
Value *V : VL) {
3651 BundleMember->TE =
Last;
3652 BundleMember = BundleMember->NextInBundle;
3655 assert(!BundleMember &&
"Bundle and VL out of sync");
3658 bool AllConstsOrCasts =
true;
3661 auto *
I = dyn_cast<CastInst>(V);
3662 AllConstsOrCasts &=
I &&
I->getType()->isIntegerTy();
3663 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3664 !UserTreeIdx.UserTE->isGather())
3667 if (AllConstsOrCasts)
3669 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3670 MustGather.
insert(VL.begin(), VL.end());
3673 if (UserTreeIdx.UserTE)
3674 Last->UserTreeIndices.push_back(UserTreeIdx);
  /// --- Vectorization State ---

  /// Holds all of the tree entries.
  TreeEntry::VecTreeTy VectorizableTree;

#ifndef NDEBUG
  /// Debug printer.
  LLVM_DUMP_METHOD void dumpVectorizableTree() const {
    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
      dbgs() << "\n";
    }
  }
#endif

  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }

  const TreeEntry *getTreeEntry(Value *V) const {
    return ScalarToTreeEntry.lookup(V);
  }
  /// Check that the operand node of an alternate node does not generate a
  /// buildvector sequence that outweighs the benefit.
  bool areAltOperandsProfitable(const InstructionsState &S,
                                ArrayRef<Value *> VL) const;

  /// Checks if the specified list of instructions/values can be vectorized and
  /// fills the required data before the actual scheduling.
  TreeEntry::EntryState
  getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
                               bool IsScatterVectorizeUserTE,
                               OrdersType &CurrentOrder,
                               SmallVectorImpl<Value *> &PointerOps);

  // ...

  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;

  /// True if graph-nodes-transforming mode is on.
  bool IsGraphTransformMode = false;

  /// The index of the first gathered load entry in the VectorizableTree.
  std::optional<unsigned> GatheredLoadsEntriesFirst;

  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    // ...
  };

  /// Checks if two instructions may access the same memory, caching the
  /// result per instruction pair.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    // First check if the result is already in the cache.
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto It = AliasCache.find(Key);
    if (It != AliasCache.end())
      return It->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    // Store the result in the cache (in both directions).
    AliasCache.try_emplace(Key, Aliased);
    AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
    return Aliased;
  }

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  /// A list of values that need to be extracted out of the tree.
  UserList ExternalUses;
  /// Contains all scheduling-relevant data for an instruction. A ScheduleData
  /// either represents a single instruction or a member of an instruction
  /// bundle (a group of instructions combined into a vector instruction).
  struct ScheduleData {
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

    void init(int BlockSchedulingRegionID, Instruction *I) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      Inst = I;
      TE = nullptr;
    }

    /// Verify basic self-consistency properties.
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

      if (IsScheduled) {
        assert(isSchedulingEntity() &&
               "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");
        }
      }

      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      UnscheduledDeps = Dependencies;
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      int Sum = 0;
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
          return InvalidDeps;
        Sum += BundleMember->UnscheduledDeps;
      }
      return Sum;
    }

    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst = nullptr;

    /// The TreeEntry that this instruction corresponds to.
    TreeEntry *TE = nullptr;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle = nullptr;

    /// Single linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle = nullptr;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore = nullptr;

    /// The dependent memory instructions. Derived on demand in
    /// calculateDependencies().
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// List of instructions this instruction could be control dependent on.
    SmallVector<ScheduleData *, 4> ControlDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;

    /// The number of dependencies: number of users of the instruction plus the
    /// number of dependent memory instructions (if any). Calculated on demand;
    /// InvalidDeps if not calculated yet.
    int Dependencies = InvalidDeps;

    /// The number of dependencies minus the number of dependencies of already
    /// scheduled instructions. Decreased while scheduling.
    int UnscheduledDeps = InvalidDeps;

    /// True if this instruction is scheduled (or considered as scheduled in
    /// the dry-run).
    bool IsScheduled = false;
  };

#ifndef NDEBUG
  friend inline raw_ostream &operator<<(raw_ostream &os,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(os);
    return os;
  }
#endif
  /// Contains all scheduling data for a basic block.
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      // Reduce the maximum schedule region size by the size of the previous
      // scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not in
      // the new region yet.
      ++SchedulingRegionID;
    }

    ScheduleData *getScheduleData(Instruction *I) {
      if (BB != I->getParent())
        // Avoid lookup if it can't possibly be in the map.
        return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))
        return SD;
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }
4092 template <
typename ReadyListType>
4093 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
4094 SD->IsScheduled =
true;
4097 for (ScheduleData *BundleMember = SD; BundleMember;
4098 BundleMember = BundleMember->NextInBundle) {
4103 auto &&DecrUnsched = [
this, &ReadyList](
Instruction *
I) {
4104 ScheduleData *OpDef = getScheduleData(
I);
4105 if (OpDef && OpDef->hasValidDependencies() &&
4106 OpDef->incrementUnscheduledDeps(-1) == 0) {
4110 ScheduleData *DepBundle = OpDef->FirstInBundle;
4111 assert(!DepBundle->IsScheduled &&
4112 "already scheduled bundle gets ready");
4113 ReadyList.insert(DepBundle);
4115 <<
"SLP: gets ready (def): " << *DepBundle <<
"\n");
4122 if (TreeEntry *TE = BundleMember->TE) {
4124 int Lane = std::distance(
TE->Scalars.begin(),
4125 find(
TE->Scalars, BundleMember->Inst));
4126 assert(Lane >= 0 &&
"Lane not set");
4134 auto *
In = BundleMember->Inst;
4137 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
4138 In->getNumOperands() ==
TE->getNumOperands()) &&
4139 "Missed TreeEntry operands?");
4142 for (
unsigned OpIdx = 0, NumOperands =
TE->getNumOperands();
4143 OpIdx != NumOperands; ++OpIdx)
4144 if (
auto *
I = dyn_cast<Instruction>(
TE->getOperand(OpIdx)[Lane]))
4149 for (
Use &U : BundleMember->Inst->operands())
4150 if (
auto *
I = dyn_cast<Instruction>(
U.get()))
4154 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
4155 if (MemoryDepSD->hasValidDependencies() &&
4156 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
4159 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
4160 assert(!DepBundle->IsScheduled &&
4161 "already scheduled bundle gets ready");
4162 ReadyList.insert(DepBundle);
4164 <<
"SLP: gets ready (mem): " << *DepBundle <<
"\n");
4168 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
4169 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
4172 ScheduleData *DepBundle = DepSD->FirstInBundle;
4173 assert(!DepBundle->IsScheduled &&
4174 "already scheduled bundle gets ready");
4175 ReadyList.insert(DepBundle);
4177 <<
"SLP: gets ready (ctl): " << *DepBundle <<
"\n");
4188 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
4189 ScheduleStart->comesBefore(ScheduleEnd) &&
4190 "Not a valid scheduling region?");
4192 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
4193 auto *SD = getScheduleData(
I);
4196 assert(isInSchedulingRegion(SD) &&
4197 "primary schedule data not in window?");
4198 assert(isInSchedulingRegion(SD->FirstInBundle) &&
4199 "entire bundle in window!");
4203 for (
auto *SD : ReadyInsts) {
4204 assert(SD->isSchedulingEntity() && SD->isReady() &&
4205 "item in ready list not ready?");
4211 template <
typename ReadyListType>
4212 void initialFillReadyList(ReadyListType &ReadyList) {
4213 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
4214 ScheduleData *SD = getScheduleData(
I);
4215 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
4217 ReadyList.insert(SD);
4219 <<
"SLP: initially in ready list: " << *SD <<
"\n");
4233 std::optional<ScheduleData *>
4235 const InstructionsState &S);
4241 ScheduleData *allocateScheduleDataChunks();
4245 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
4250 ScheduleData *PrevLoadStore,
4251 ScheduleData *NextLoadStore);
4255 void calculateDependencies(ScheduleData *SD,
bool InsertInReadyList,
4259 void resetSchedule();
4289 ScheduleData *FirstLoadStoreInRegion =
nullptr;
4293 ScheduleData *LastLoadStoreInRegion =
nullptr;
4298 bool RegionHasStackSave =
false;
4301 int ScheduleRegionSize = 0;
4310 int SchedulingRegionID = 1;
4318 void scheduleBlock(BlockScheduling *BS);
  /// A simple ordered struct used as a DenseMap key.
  struct OrdersTypeDenseMapInfo {
    // ...
    static unsigned getHashValue(const OrdersType &V) {
      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
    }
    // ...
  };

  // ...

  unsigned MaxVecRegSize; // Set by cl::opt or TTI.
  unsigned MinVecRegSize; // Set by cl::opt or TTI.

  /// Final size of the reduced vector, if the current graph represents the
  /// input for the reduction and it was possible to narrow the reduction.
  unsigned ReductionBitWidth = 0;

  /// The size of the base graph before transformations.
  unsigned BaseGraphSize = 1;

  /// If the tree contains any zext/sext/trunc nodes, contains the max-min pair
  /// of type sizes used in the tree.
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
template <> struct GraphTraits<BoUpSLP *> {
  using TreeEntry = BoUpSLP::TreeEntry;
  using NodeRef = TreeEntry *;
  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;

  /// Wraps the EdgeInfo iterator so that it dereferences to a TreeEntry*.
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    // ...
  };

  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }

  static ChildIteratorType child_begin(NodeRef N) {
    return {N->UserTreeIndices.begin(), N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {N->UserTreeIndices.end(), N->Container};
  }

  /// For the node iterator we just need to turn the TreeEntry iterator into a
  /// TreeEntry* iterator so that it dereferences to NodeRef.
  class nodes_iterator {
    // ...
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
  };

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }

  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};
4460 OS << Entry->Idx <<
".\n";
4463 for (
auto *V : Entry->Scalars) {
4465 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
4466 return EU.Scalar == V;
4476 if (Entry->isGather())
4478 if (Entry->State == TreeEntry::ScatterVectorize ||
4479 Entry->State == TreeEntry::StridedVectorize)
4480 return "color=blue";
BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      // Temporarily insert instructions back so they can be erased from their
      // parent (and from memory) later.
      if (isa<PHINode>(I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator());
      continue;
    }
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }

  // Clean up any dead scalar code feeding the vectorized instructions.
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);

#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  assert(!verifyFunction(*F, &dbgs()));
#endif
}
4529 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
4530 "Expected non-empty mask.");
4533 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
4535 Reuses[Mask[
I]] = Prev[
I];
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
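
/// Tries to detect a reusable element order for a gather node: checks whether
/// the gathered scalars can be served by shuffles of already-vectorized tree
/// entries and/or extractelement instructions and, if so, returns the element
/// order implied by those shuffles.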
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
  Type *ScalarTy = GatheredScalars.front()->getType();
  int NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  auto *VecTy = getWidenedType(ScalarTy, NumScalars);
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= NumScalars ||
      VecTy->getNumElements() % NumParts != 0 ||
      !hasFullVectorsOrPowerOf2(*TTI, ScalarTy,
                                VecTy->getNumElements() / NumParts))
    NumParts = 1;
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                            /*ForOrder=*/true);
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      Entries.front().front()->isSame(TE.Scalars)) {
    // Perfect match in the graph, will reuse the previously vectorized node.
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    int SingleElt = PoisonMaskElem;
    return all_of(Mask, [&](int I) {
      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
        SingleElt = I;
      return I == PoisonMaskElem || I == SingleElt;
    });
  };
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  SmallBitVector ShuffledSubMasks(NumParts);
  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                  ArrayRef<int> Mask, int PartSz, int NumParts,
                                  function_ref<unsigned(unsigned)> GetVF) {
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
        continue;
      const int VF = GetVF(I);
      if (VF == 0)
        continue;
      unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
      MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
      // Shuffle of at least 2 vectors - ignore.
      if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      // Try to include as much elements from the mask as possible.
      int FirstMin = INT_MAX;
      int SecondVecFound = false;
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem) {
          Value *V = GatheredScalars[I * PartSz + K];
          if (isConstant(V) && !isa<PoisonValue>(V)) {
            SecondVecFound = true;
            break;
          }
          continue;
        }
        if (Idx < VF) {
          if (FirstMin > Idx)
            FirstMin = Idx;
        } else {
          SecondVecFound = true;
          break;
        }
      }
      FirstMin = (FirstMin / PartSz) * PartSz;
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem)
          continue;
        Idx -= FirstMin;
        if (Idx >= PartSz) {
          SecondVecFound = true;
          break;
        }
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      }
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
    }
  };
  int PartSz = getPartNumElems(NumScalars, NumParts);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          unsigned VF = 0;
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            if (ExtractMask[K] == PoisonMaskElem)
              continue;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (K == PoisonMaskElem)
              continue;
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  // Check special corner case - single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
        return 0U;
      return std::max(Entries[I].front()->getVectorFactor(),
                      Entries[I].back()->getVectorFactor());
    });
  int NumUndefs =
      count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
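
/// Checks if two pointer operands can be treated as compatible for
/// vectorization: either plain pointers, or two-operand GEPs whose index
/// operands are constants or share the same opcode.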
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
               .getOpcode()));
}
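
/// Calculates the minimum common alignment of all loads or stores in \p VL.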
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}
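
/// Checks if the given order is a strictly decreasing (reverse identity)
/// order, treating entries equal to the size as "any position".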
4785 "Order is empty. Please check it before using isReverseOrder.");
4786 unsigned Sz = Order.
size();
4788 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
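
/// Checks if the provided list of pointers \p PointerOps represents a strided
/// access with a runtime (non-constant) stride and, if so, fills
/// \p SortedIndices with the offset-sorted order of the pointers.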
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                  const DataLayout &DL, ScalarEvolution &SE,
                  SmallVectorImpl<unsigned> &SortedIndices) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find lower/upper pointers from the PointerOps (i.e. with lowest and
  // highest SCEVs.
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return std::nullopt;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
    if (SE.isKnownNegative(Diff)) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
    if (SE.isKnownNegative(Diff1)) {
      PtrSCEVHighest = PtrSCEV;
      continue;
    }
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest;
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return nullptr;
  };
  // Stride_in_elements = Dist / element_size * (num_elems - 1).
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return std::nullopt;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  // Iterate through all pointers and check if all distances are
  // unique multiple of Stride.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    unsigned Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
      if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
                                                  SE.getMulExpr(Stride, SC)))
               ->isZero())
        return std::nullopt;
      Dist = SC->getAPInt().getZExtValue();
    }
    // If the strides are not the same or repeated, we can't vectorize.
    if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return std::nullopt;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices array only if it is non-consecutive.
    SortedIndices.resize(PointerOps.size(), 0);
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy);

static InstructionCost getShuffleCost(const TargetTransformInfo &TTI,
                                      TTI::ShuffleKind Kind, VectorType *Tp,
                                      ArrayRef<int> Mask = {}) {
  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
  int NumSubElts;
  int Index;
  if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
                             Mask, NumSrcElts, NumSubElts, Index)) {
    if (Index + NumSubElts > NumSrcElts &&
        Index + NumSrcElts <= static_cast<int>(Mask.size()))
      Kind = TTI::SK_InsertSubvector;
  }
  return TTI.getShuffleCost(Kind, Tp, Mask);
}
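
/// Checks whether a run of loads can be vectorized as a plain vector load, a
/// strided load, or a masked gather, and classifies the result accordingly;
/// optionally recurses on smaller vector factors to find the best VF.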
BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                           SmallVectorImpl<unsigned> &Order,
                           SmallVectorImpl<Value *> &PointerOps,
                           unsigned *BestVF, bool TryRecursiveCheck) const {
  if (BestVF)
    *BestVF = 0;
  // Check that a vectorized load would load the same memory as a scalar
  // load.
  Type *ScalarTy = VL0->getType();
  if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
    return LoadsState::Gather;

  // Make sure all loads in the bundle are simple - we can't vectorize
  // atomic or volatile loads.
  PointerOps.clear();
  const unsigned Sz = VL.size();
  PointerOps.resize(Sz);
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }

  Order.clear();
  auto *VecTy = getWidenedType(ScalarTy, Sz);
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  // Check the order of pointer operands.
  if (sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order)) {
    Value *Ptr0;
    Value *PtrN;
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    }
    std::optional<int> Diff =
        getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
    // Check that the sorted loads are consecutive.
    if (static_cast<unsigned>(*Diff) == Sz - 1)
      return LoadsState::Vectorize;
    bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
    // Try to generate strided load node if the pointers are used outside of
    // the graph or the distance is too large for a simple vector load.
    auto IsAnyPointerUsedOutGraph =
        IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
          return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
                   return !getTreeEntry(U) && !MustGather.contains(U);
                 });
        });
    const unsigned AbsoluteDiff = std::abs(*Diff);
    if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
                              AbsoluteDiff > Sz ||
                              *Diff == -(static_cast<int>(Sz) - 1))) {
      int Stride = *Diff / static_cast<int>(Sz - 1);
      if (*Diff == Stride * static_cast<int>(Sz - 1)) {
        Align Alignment =
            cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
                ->getAlign();
        if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
          // Iterate through all pointers and check if all distances are
          // unique multiples of Stride.
          SmallSet<int, 4> Dists;
          for (Value *Ptr : PointerOps) {
            int Dist = 0;
            if (Ptr == PtrN)
              Dist = *Diff;
            else if (Ptr != Ptr0)
              Dist =
                  *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
            if (((Dist / Stride) * Stride) != Dist ||
                !Dists.insert(Dist).second)
              break;
          }
          if (Dists.size() == Sz)
            return LoadsState::StridedVectorize;
        }
      }
    }
  }
  // Compare masked gather cost against the cost of loads + shuffles.
  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
                                                unsigned *BestVF,
                                                bool ProfitableGatherPointers) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    auto [ScalarGEPCost, VectorGEPCost] =
        getGEPCosts(TTI, PointerOps, PointerOps.front(),
                    Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
    APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
    auto *PtrVecTy = getWidenedType(PointerOps.front()->getType(),
                                    VecTy->getNumElements());
    if (static_cast<unsigned>(count_if(
            PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
        !all_of(PointerOps, [&](Value *V) {
          return getUnderlyingObject(V) ==
                 getUnderlyingObject(PointerOps.front());
        }))
      VectorGEPCost += TTI.getScalarizationOverhead(
          PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false,
          CostKind);
    InstructionCost MaskedGatherCost =
        TTI.getGatherScatterOpCost(Instruction::Load, VecTy,
                                   PointerOps.front(),
                                   /*VariableMask=*/false, CommonAlignment,
                                   CostKind) +
        (ProfitableGatherPointers ? 0 : VectorGEPCost);
    InstructionCost GatherCost = 0;
    constexpr unsigned ListLimit = 4;
    if (!TryRecursiveCheck || VL.size() < ListLimit)
      return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
    unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
    unsigned MinVF = getMinVF(2 * Sz);
    DemandedElts.clearAllBits();
    // Iterate through possible vectorization factors and check if vectorized
    // loads + shuffles are better than just gather.
    for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
      InstructionCost VecLdCost = 0;
      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
        ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
        SmallVector<unsigned> Order;
        SmallVector<Value *> PointerOps;
        if (canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
                              /*TryRecursiveCheck=*/false) ==
            LoadsState::Gather)
          DemandedElts.setBits(Cnt, Cnt + VF);
      }
      if (!DemandedElts.isZero()) {
        // Some loads gathered - estimate their gather cost and try a
        // smaller VF.
        for (unsigned Idx : seq<unsigned>(VL.size()))
          if (DemandedElts[Idx])
            GatherCost += getGatherCost(VL[Idx]);
        continue;
      }
      auto *SubVecTy = getWidenedType(ScalarTy, VF);
      for (unsigned I = 0, End = VL.size() / VF; I < End; ++I) {
        auto *LI0 = cast<LoadInst>(VL[I * VF]);
        auto [ScalarGEPCost, VectorGEPCost] =
            getGEPCosts(TTI, PointerOps, LI0->getPointerOperand(),
                        Instruction::GetElementPtr, CostKind, ScalarTy,
                        SubVecTy);
        if (static_cast<unsigned>(
                count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
            PointerOps.size() - 1)
          VectorGEPCost += TTI.getScalarizationOverhead(
              PtrVecTy, APInt::getAllOnes(VF),
              /*Insert=*/true, /*Extract=*/false, CostKind);
        VecLdCost +=
            TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
                                LI0->getPointerAddressSpace(), CostKind) +
            VectorGEPCost;
      }
      SmallVector<int> ShuffleMask(VL.size());
      for (int Idx : seq<int>(0, VL.size()))
        ShuffleMask[Idx] = Idx;
      if (MaskedGatherCost >= VecLdCost &&
          VecLdCost - GatherCost < -SLPCostThreshold) {
        if (BestVF)
          *BestVF = VF;
        return true;
      }
    }
    return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
  };
  // Gathering the pointers is profitable only if they are loop invariant or
  // cheap to compute.
  const Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
  bool ProfitableGatherPointers =
      L && Sz > 2 &&
      static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
        return L->isLoopInvariant(V);
      })) <= Sz / 2;
  if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(P);
        return !isa<GetElementPtrInst>(P) ||
               (GEP && GEP->getNumOperands() == 2 &&
                isa<Constant, Instruction>(GEP->getOperand(1)));
      })) {
    // Check if potential masked gather can be represented as series of
    // loads + insertsubvectors.
    if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
                                                     ProfitableGatherPointers))
      return LoadsState::ScatterVectorize;
  }

  return LoadsState::Gather;
}
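
/// Clusters the pointer operands by basic block, base object, and constant
/// distance, and computes a sorted index order if the accesses form
/// consecutive runs.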
5289 "Expected list of pointer operands.");
5299 .first->second.emplace_back().emplace_back(VL.
front(), 0U, 0U);
5301 SortedIndices.
clear();
5303 auto Key = std::make_pair(BBs[Cnt + 1],
5307 std::optional<int> Diff = getPointersDiff(
5308 ElemTy, std::get<0>(Base.front()), ElemTy,
5314 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5320 if (Bases.
size() > VL.
size() / 2 - 1)
5324 Bases.
find(Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
5331 if (Bases.
size() == 1 && (Bases.
front().second.size() == 1 ||
5332 Bases.
front().second.size() == VL.
size()))
5337 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
5347 FirstPointers.
insert(P1);
5348 SecondPointers.
insert(P2);
5354 "Unable to find matching root.");
5357 for (
auto &
Base : Bases) {
5358 for (
auto &Vec :
Base.second) {
5359 if (Vec.size() > 1) {
5360 stable_sort(Vec, [](
const std::tuple<Value *, int, unsigned> &
X,
5361 const std::tuple<Value *, int, unsigned> &
Y) {
5362 return std::get<1>(
X) < std::get<1>(
Y);
5364 int InitialOffset = std::get<1>(Vec[0]);
5365 bool AnyConsecutive =
5367 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
5371 if (!AnyConsecutive)
5376 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5380 for (
auto &
T : Bases)
5381 for (
const auto &Vec :
T.second)
5382 for (
const auto &
P : Vec)
5386 "Expected SortedIndices to be the size of VL");
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();

  SmallVector<Value *> Ptrs;
  Ptrs.reserve(TE.Scalars.size());
  SmallVector<BasicBlock *> BBs;
  BBs.reserve(TE.Scalars.size());
  for (Value *V : TE.Scalars) {
    auto *L = dyn_cast<LoadInst>(V);
    if (!L || !L->isSimple())
      return std::nullopt;
    Ptrs.push_back(L->getPointerOperand());
    BBs.push_back(L->getParent());
  }

  BoUpSLP::OrdersType Order;
  if (!LoadEntriesToVectorize.contains(TE.Idx) &&
      clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
    return std::move(Order);
  return std::nullopt;
}
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *VU, InsertElementInst *V,
    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Instructions must be from the same basic blocks.
  if (VU->getType() != V->getType())
    return false;
  // Checks if 2 insertelements are from the same buildvector.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  std::optional<unsigned> Idx1 = getElementIndex(VU);
  std::optional<unsigned> Idx2 = getElementIndex(V);
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  auto *IE1 = VU;
  auto *IE2 = V;
  SmallBitVector ReusedIdx(
      cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
  bool IsReusedIdx = false;
  do {
    if (IE2 == VU && !IE1)
      return VU->hasOneUse();
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      IsReusedIdx |= ReusedIdx.test(*Idx1);
      ReusedIdx.set(*Idx1);
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        IE1 = nullptr;
      else
        IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
    }
    if (IE2 && IE2 != VU) {
      IsReusedIdx |= ReusedIdx.test(*Idx2);
      ReusedIdx.set(*Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        IE2 = nullptr;
      else
        IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
    }
  } while (!IsReusedIdx && (IE1 || IE2));
  return false;
}
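
/// Computes the preferred element order for a tree entry, if any: reuse
/// shuffle indices, load/store pointer order, PHI user order, or
/// extractelement indices, depending on the node kind and direction
/// (\p TopToBottom).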
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
  // No need to reorder if need to shuffle reuses, still need to shuffle the
  // node.
  if (!TE.ReuseShuffleIndices.empty()) {
    // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
    assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
           "Reshuffling scalars not yet supported for nodes with padding");

    if (isSplat(TE.Scalars))
      return std::nullopt;
    // Check the order of the gathered loads.
    unsigned Sz = TE.Scalars.size();
    if (TE.isGather()) {
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE)) {
        SmallVector<int> Mask;
        fixupOrderingIndices(*CurrentOrder);
        inversePermutation(*CurrentOrder, Mask);
        ::addMask(Mask, TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
            if (Idx != PoisonMaskElem)
              Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
                                             2 * TE.getVectorFactor())) == 1)
      return std::nullopt;
    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                     Sz)) {
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      ::addMask(ReorderMask, TE.ReuseShuffleIndices);
      unsigned VF = ReorderMask.size();
      OrdersType ResOrder(VF, VF);
      unsigned NumParts = divideCeil(VF, Sz);
      SmallBitVector UsedVals(NumParts);
      for (unsigned I = 0; I < VF; I += Sz) {
        int Val = PoisonMaskElem;
        unsigned UndefCnt = 0;
        unsigned Limit = std::min(Sz, VF - I);
        if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
                   [&](int Idx) {
                     if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
                       Val = Idx;
                     if (Idx == PoisonMaskElem)
                       ++UndefCnt;
                     return Idx != PoisonMaskElem && Idx != Val;
                   }) ||
            Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
            UndefCnt > Sz / 2)
          return std::nullopt;
        UsedVals.set(Val);
        for (unsigned K = 0; K < NumParts; ++K) {
          unsigned Idx = Val + Sz * K;
          if (Idx < VF)
            ResOrder[Idx] = I + K;
        }
      }
      return std::move(ResOrder);
    }
    unsigned VF = TE.getVectorFactor();
    OrdersType ResOrder(VF, VF);
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    if (TE.getOpcode() == Instruction::ExtractElement &&
        all_of(TE.Scalars, [Sz](Value *V) {
          if (isa<PoisonValue>(V))
            return true;
          std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
          return Idx && *Idx < Sz;
        })) {
      assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
                                   "by BinaryOperator and CastInst.");
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      else
        inversePermutation(TE.ReorderIndices, ReorderMask);
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == PoisonMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
        Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
      }
    }
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
      if (SubMask.front() == PoisonMaskElem)
        std::iota(SubMask.begin(), SubMask.end(), 0);
      reorderOrder(CurrentOrder, SubMask);
      transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
      std::advance(It, Sz);
    }
    if (all_of(enumerate(ResOrder), [](const auto &Data) {
          return Data.index() == Data.value();
        }))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      any_of(TE.UserTreeIndices,
             [](const EdgeInfo &EI) {
               return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
             }) &&
      (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
    return std::nullopt;
  if ((TE.State == TreeEntry::Vectorize ||
       TE.State == TreeEntry::StridedVectorize) &&
      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
    assert(!TE.isAltShuffle() && "Alternate instructions are only supported by "
                                 "BinaryOperator and CastInst.");
    return TE.ReorderIndices;
  }
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;
    SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
    for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
      if (!V->hasNUsesOrMore(1))
        continue;
      auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
      if (!II)
        continue;
      Instruction *BVHead = nullptr;
      BasicBlock *BB = II->getParent();
      while (II && II->hasOneUse() && II->getParent() == BB) {
        BVHead = II;
        II = dyn_cast<InsertElementInst>(II->getOperand(0));
      }
      I = BVHead;
    }
    auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
      assert(BB1 != BB2 && "Expected different basic blocks.");
      auto *NodeA = DT->getNode(BB1);
      auto *NodeB = DT->getNode(BB2);
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
    };
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0) ||
          isa<PoisonValue>(V1) || isa<PoisonValue>(V2))
        return false;
      auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
      if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
        return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
                                    FirstUserOfPhi2->getParent());
      auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
      auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
      auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
      auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
      if (IE1 && IE2) {
        if (UserBVHead[I1] && !UserBVHead[I2])
          return true;
        if (!UserBVHead[I1])
          return false;
        if (UserBVHead[I1] == UserBVHead[I2])
          return getElementIndex(IE1) < getElementIndex(IE2);
        if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
          return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
                                      UserBVHead[I2]->getParent());
        return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
      }
      if (EE1 && EE2) {
        auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
        auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
        auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
        auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
        if (EE1->getOperand(0) == EE2->getOperand(0))
          return getElementIndex(EE1) < getElementIndex(EE2);
        if (!Inst1 && Inst2)
          return false;
        if (Inst1 && Inst2) {
          if (Inst1->getParent() != Inst2->getParent())
            return CompareByBasicBlocks(Inst1->getParent(),
                                        Inst2->getParent());
          return Inst1->comesBefore(Inst2);
        }
        assert(P1 && P2 &&
               "Expected either instructions or arguments vector operands.");
        return P1->getArgNo() < P2->getArgNo();
      }
      return false;
    };
    OrdersType Phis(TE.Scalars.size());
    std::iota(Phis.begin(), Phis.end(), 0);
    stable_sort(Phis, PHICompare);
    if (isIdentityOrder(Phis))
      return std::nullopt;
    return std::move(Phis);
  }
  if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
    // Check that gather of extractelements can be represented as just a
    // shuffle of a single/two vectors.
    if ((TE.getOpcode() == Instruction::ExtractElement ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
        all_of(TE.Scalars, [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
        })) {
      OrdersType CurrentOrder;
      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
                                   /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // If the gather node is <undef, v, .., poison> and
    // insertelement poison, v, 0 [+ permute]
    // is cheaper than
    // insertelement poison, v, n - try to reorder.
    int Sz = TE.Scalars.size();
    if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
        count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
      const auto *It =
          find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
      if (It == TE.Scalars.begin())
        return OrdersType();
      if (It != TE.Scalars.end()) {
        unsigned Idx = std::distance(TE.Scalars.begin(), It);
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          OrdersType Order(Sz, Sz);
          Order[Idx] = 0;
          return std::move(Order);
        }
      }
    }
    if (isSplat(TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
  }
  if (TE.isGather() && TE.getOpcode() == Instruction::Load) {
    SmallVector<Value *> PointerOps;
    OrdersType CurrentOrder;
    LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                       CurrentOrder, PointerOps);
    if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
      return std::move(CurrentOrder);
  }
  if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
    return CurrentOrder;
  return std::nullopt;
}
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                               unsigned Sz) {
  ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
  if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
    return false;
  for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
    ArrayRef<int> Cluster = Mask.slice(I, Sz);
    if (Cluster != FirstCluster)
      return false;
  }
  return true;
}
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // For vectorized and non-clustered reused scalars - just reorder the reuses
  // mask.
  const unsigned Sz = TE.Scalars.size();
  if (!TE.isGather() ||
      !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz)) {
    reorderReuses(TE.ReuseShuffleIndices, Mask);
    return;
  }
  SmallVector<int> NewMask;
  inversePermutation(TE.ReorderIndices, NewMask);
  addMask(NewMask, TE.ReuseShuffleIndices);
  reorderScalars(TE.Scalars, NewMask);
  TE.ReorderIndices.clear();
  // Fill the reuses mask with the identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}
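
/// Merges \p SecondaryOrder into \p Order, filling unassigned slots of the
/// primary order with compatible entries of the secondary one.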
5830 "Expected same size of orders");
5831 unsigned Sz = Order.
size();
5833 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
5834 if (Order[
Idx] != Sz)
5835 UsedIndices.
set(Order[
Idx]);
5837 if (SecondaryOrder.
empty()) {
5838 for (
unsigned Idx : seq<unsigned>(0, Sz))
5839 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
5842 for (
unsigned Idx : seq<unsigned>(0, Sz))
5843 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
5844 !UsedIndices.
test(SecondaryOrder[
Idx]))
5845 Order[
Idx] = SecondaryOrder[
Idx];
void BoUpSLP::reorderTopToBottom() {
  // Maps VF to the graph nodes.
  MapVector<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
  // Maps a TreeEntry to the reorder indices of external users.
  DenseMap<const TreeEntry *, SmallVector<OrdersType>>
      ExternalUserReorderMap;
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
  DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
  DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
  // Find all reorderable nodes with the given VF.
  for_each(VectorizableTree, [&, &TTIRef = *TTI](
                                 const std::unique_ptr<TreeEntry> &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector<OrdersType> ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      ExternalUserReorderMap.try_emplace(
          TE.get(), std::move(ExternalUserReorderIndices));
    }

    // Patterns like [fadd,fsub] can be combined into a single instruction in
    // x86. Reordering them into [fsub,fadd] blocks this pattern. So, keep the
    // order in such cases.
    if (TE->isAltShuffle()) {
      VectorType *VecTy =
          getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
      // If this pattern is supported by the target then we consider the order.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
        AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
      }
    }

    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/true)) {
      // Do not include ordering for nodes used in the alt opcode vectorization
      // (only one node per opcode chain can be ordered).
      const TreeEntry *UserTE = TE.get();
      while (UserTE && !UserTE->UserTreeIndices.empty()) {
        if (UserTE->UserTreeIndices.size() != 1)
          break;
        if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
              return EI.UserTE->State == TreeEntry::Vectorize &&
                     EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
            }))
          return;
        UserTE = UserTE->UserTreeIndices.back().UserTE;
      }
      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
    }
  });

  // Reorder the graph nodes according to their vectorization factor.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order. We just are looking for the most
    // used order and reorder scalar elements in the nodes according to this
    // mostly used order.
    ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
    // All operands are reordered and used only in this node - propagate the
    // most used order to the user node.
    MapVector<OrdersType, unsigned,
              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
        OrdersUses;
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder this nodes, still need to extend and to use shuffle,
      // just need to merge reordering shuffle and the reuse shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
        continue;
      // Count number of orders uses.
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // If the OpTE vector factor != number of scalars - use natural order.
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
        }
      }
      // Stores actually store the mask, not the order, need to invert.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        SmallVector<int> Mask;
        inversePermutation(Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Mask, CurrentOrder.begin(), [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(CurrentOrder);
        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
      } else {
        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
        combineOrders(IdentityOrder, Pair.first);
      }
    }
    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // Prefer identity order. But, if filled identity found (non-empty order)
      // with same number of uses, as the new candidate order, we can choose
      // this candidate order.
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           isIdentityOrder(BestOrder))) {
        BestOrder = Pair.first;
        Cnt = Pair.second;
      } else {
        combineOrders(BestOrder, Pair.first);
      }
    }
    if (isIdentityOrder(BestOrder))
      continue;
    fixupOrderingIndices(BestOrder);
    SmallVector<int> Mask;
    inversePermutation(BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
    unsigned E = BestOrder.size();
    transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // Do an actual reordering, if profitable.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // Just do the reordering for the nodes with the given VF.
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          // Need to reorder the reuses masks of the operands with smaller VF
          // to be able to find the match between the graph nodes and scalar
          // operands of the given node during vectorization/cost estimation.
          assert(all_of(TE->UserTreeIndices,
                        [VF, &TE](const EdgeInfo &EI) {
                          return EI.UserTE->Scalars.size() == VF ||
                                 EI.UserTE->Scalars.size() ==
                                     TE->Scalars.size() ||
                                 isa<ShuffleVectorInst>(
                                     EI.UserTE->getMainOp());
                        }) &&
                 "All users must be of VF size.");
          // Update ordering of the operands with the smaller VF than the given
          // one.
          reorderNodeWithReuses(*TE, Mask);
        }
        continue;
      }
      if ((TE->State == TreeEntry::Vectorize ||
           TE->State == TreeEntry::StridedVectorize) &&
          (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
               InsertElementInst>(TE->getMainOp()) ||
           (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
        assert(!TE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        // Build correct orders for extract{element,value}, loads and stores.
        reorderOrder(TE->ReorderIndices, Mask);
        if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        reorderScalars(TE->Scalars, Mask);
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply reversed order to keep the original ordering of the reused
        // elements to avoid extra reorder indices shuffling.
        OrdersType CurrentOrder;
        reorderOrder(CurrentOrder, MaskOrder);
        SmallVector<int> NewReuses;
        inversePermutation(CurrentOrder, NewReuses);
        addMask(NewReuses, TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(NewReuses);
      }
    }
  }
}
bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    ArrayRef<TreeEntry *> ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {
  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
          return OpData.first == I &&
                 (OpData.second->State == TreeEntry::Vectorize ||
                  OpData.second->State == TreeEntry::StridedVectorize);
        }))
      continue;
    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
      // Do not reorder if operand node is used by many user nodes.
      if (any_of(TE->UserTreeIndices,
                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
        return false;
      // Add the node to the list of the ordered nodes with the identity
      // order.
      Edges.emplace_back(I, TE);
      // Add ScatterVectorize nodes to the list of operands, where just
      // reordering of the scalars is required. Similar to the gathers, so
      // simply add to the list of gathered ops.
      if (TE->State != TreeEntry::Vectorize &&
          TE->State != TreeEntry::StridedVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
    }
    TreeEntry *Gather = nullptr;
    if (count_if(ReorderableGathers,
                 [&Gather, UserTE, I](TreeEntry *TE) {
                   assert(TE->State != TreeEntry::Vectorize &&
                          TE->State != TreeEntry::StridedVectorize &&
                          "Only non-vectorized nodes are expected.");
                   if (any_of(TE->UserTreeIndices,
                              [UserTE, I](const EdgeInfo &EI) {
                                return EI.UserTE == UserTE && EI.EdgeIdx == I;
                              })) {
                     assert(TE->isSame(UserTE->getOperand(I)) &&
                            "Operand entry does not match operands.");
                     Gather = TE;
                     return true;
                   }
                   return false;
                 }) > 1 &&
        !allConstant(UserTE->getOperand(I)))
      return false;
    if (Gather)
      GatherOps.push_back(Gather);
  }
  return true;
}
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  SetVector<TreeEntry *> OrderedEntries;
  DenseSet<const TreeEntry *> GathersToOrders;
  // Find all reorderable leaf nodes with the given VF.
  SmallVector<TreeEntry *> NonVectorized;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize)
      NonVectorized.push_back(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false)) {
      OrderedEntries.insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
    }
  }

  // 1. Propagate order to the graph nodes, which use only reordered nodes.
  // 2. If the data was reordered - try to reorder the node operands.
  SmallPtrSet<const TreeEntry *, 4> Visited;
  while (!OrderedEntries.empty()) {
    // 1. Filter out only reordered nodes.
    DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
    SmallVector<TreeEntry *> Filtered;
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          !all_of(drop_begin(TE->UserTreeIndices),
                  [TE](const EdgeInfo &EI) {
                    return EI.UserTE == TE->UserTreeIndices.front().UserTE;
                  }) ||
          !Visited.insert(TE).second) {
        Filtered.push_back(TE);
        continue;
      }
      // Build a map between user nodes and their operands order to speedup
      // search. The graph currently does not provide this dependency directly.
      for (EdgeInfo &EI : TE->UserTreeIndices)
        Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
    }
    // Erase filtered entries, so they are not checked again.
    for (TreeEntry *TE : Filtered)
      OrderedEntries.remove(TE);
    SmallVector<
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
        UsersVec(Users.begin(), Users.end());
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    });
    for (auto &Data : UsersVec) {
      // Check that operands are used only in the User node.
      SmallVector<TreeEntry *> GatherOps;
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              GatherOps)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // All operands are reordered and used only in this node - propagate
      // the most used order to the user node.
      MapVector<OrdersType, unsigned,
                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
          OrdersUses;
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
          continue;
        const auto Order = [&]() -> const OrdersType {
          if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
            return getReorderingData(*OpTE, /*TopToBottom=*/false)
                .value_or(OrdersType(1));
          return OpTE->ReorderIndices;
        }();
        // The order is partially ordered, skip it in favor of fully
        // non-ordered orders.
        if (Order.size() == 1)
          continue;
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask, not the order, need to invert.
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          assert(!OpTE->isAltShuffle() &&
                 "Alternate instructions are only supported by BinaryOperator "
                 "and CastInst.");
          SmallVector<int> Mask;
          inversePermutation(Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(CurrentOrder);
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
              NumOps;
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        }
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto AllowsReordering = [&](const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->isGather()) {
            if (GathersToOrders.contains(TE))
              return !getReorderingData(*TE, /*TopToBottom=*/false)
                          .value_or(OrdersType(1))
                          .empty();
            return true;
          }
          return false;
        };
        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
          TreeEntry *UserTE = EI.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            continue;
          // May reorder user node if it requires reordering, has reused
          // scalars, is an alternate op vectorize node or its op nodes require
          // reordering.
          if (AllowsReordering(UserTE))
            continue;
          // Check if users allow reordering: profitable only if definitely
          // more operands allow reordering than those with the natural order.
          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           all_of(Op.second->UserTreeIndices,
                                  [UserTE](const EdgeInfo &EI) {
                                    return EI.UserTE == UserTE;
                                  });
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
      if (OrdersUses.empty()) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // Choose the most used order.
      unsigned IdentityCnt = 0;
      unsigned VF = Data.second.front().second->getVectorFactor();
      OrdersType IdentityOrder(VF, VF);
      for (auto &Pair : OrdersUses) {
        if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
          IdentityCnt += Pair.second;
          combineOrders(IdentityOrder, Pair.first);
        }
      }
      MutableArrayRef<unsigned> BestOrder = IdentityOrder;
      unsigned Cnt = IdentityCnt;
      for (auto &Pair : OrdersUses) {
        if (Cnt < Pair.second) {
          combineOrders(Pair.first, BestOrder);
          BestOrder = Pair.first;
          Cnt = Pair.second;
        } else {
          combineOrders(BestOrder, Pair.first);
        }
      }
      // Set order of the user node.
      if (isIdentityOrder(BestOrder)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      fixupOrderingIndices(BestOrder);
      // Erase operands from OrderedEntries list and adjust their orders.
      VisitedOps.clear();
      SmallVector<int> Mask;
      inversePermutation(BestOrder, Mask);
      SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
      unsigned E = BestOrder.size();
      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        OrderedEntries.remove(TE);
        if (!VisitedOps.insert(TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::StridedVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(TE->ReorderIndices, Mask);
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers just need to reorder its scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // Just reorder reuses indices.
          reorderReuses(Gather->ReuseShuffleIndices, Mask);
          continue;
        }
        reorderScalars(Gather->Scalars, Mask);
        OrderedEntries.remove(Gather);
      }
      // Reorder operands of the user node and set the ordering for the user
      // node itself.
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Data.first->getMainOp()) ||
          Data.first->isAltShuffle())
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle() ||
          Data.first->State == TreeEntry::StridedVectorize) {
        reorderScalars(Data.first->Scalars, Mask);
        reorderOrder(Data.first->ReorderIndices, MaskOrder,
                     /*BottomOrder=*/true);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !Data.first->isAltShuffle()) {
          // Insert user node to the list to try to sink reordering deeper in
          // the graph.
          OrderedEntries.insert(Data.first);
        }
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if ((Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  DenseMap<Value *, unsigned> ScalarToExtUses;
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather())
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
        continue;
      // All uses must be replaced already? No need to do it again.
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;

      // Check if the scalar is externally used as an extra arg.
      const auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
        continue;
      }
      for (User *U : Scalar->users()) {
        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst || isDeleted(UserInst))
          continue;

        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;

        // Skip in-tree scalars that become vectors.
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in FoundLane will
          // be used.
          if (UseEntry->State == TreeEntry::ScatterVectorize ||
              !doesInTreeUserNeedToExtract(
                  Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(!UseEntry->isGather() && "Bad state");
            continue;
          }
          U = nullptr;
          if (It != ScalarToExtUses.end()) {
            ExternalUses[It->second].User = nullptr;
            break;
          }
        }

        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          U = nullptr;
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, FoundLane);
        if (!U)
          break;
      }
    }
  }
}
SmallVector<SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
                SmallVector<StoreInst *>>
      PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (!isa<Instruction>(V))
      continue;
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      // If the store is not simple, not in the same function, or has a
      // non-vectorizable value type - skip it.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // Skip entry if already.
      if (getTreeEntry(U))
        continue;

      Value *Ptr =
          getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(),
                                        Ptr}];
      // For now, just add the store to the first lane that accesses it.
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        std::optional<int> Diff = getPointersDiff(
            SI->getValueOperand()->getType(), SI->getPointerOperand(),
            SI->getValueOperand()->getType(),
            StoresVec.front()->getPointerOperand(), *DL, *SE,
            /*StrictCheck=*/true);
        // We failed to compare the pointers so just abandon this store.
        if (!Diff)
          continue;
      }
      StoresVec.push_back(SI);
    }
  }
  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
  unsigned I = 0;
  for (auto &P : PtrToStoresMap) {
    Res[I].swap(P.second);
    ++I;
  }
  return Res;
}
bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {
  // To avoid calling getPointersDiff() while sorting we create a vector of
  // pairs {store, offset from first} and sort this instead.
  SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
  StoreInst *S0 = StoresVec[0];
  StoreOffsetVec.emplace_back(0, 0);
  Type *S0Ty = S0->getValueOperand()->getType();
  Value *S0Ptr = S0->getPointerOperand();
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
    StoreInst *SI = StoresVec[Idx];
    std::optional<int> Diff =
        getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
                        SI->getPointerOperand(), *DL, *SE,
                        /*StrictCheck=*/true);
    StoreOffsetVec.emplace_back(*Diff, Idx);
  }

  // Check if the stores are consecutive by checking if the sorted offsets
  // differ by 1.
  if (StoreOffsetVec.size() != StoresVec.size())
    return false;
  sort(StoreOffsetVec,
       [](const std::pair<int, unsigned> &L,
          const std::pair<int, unsigned> &R) { return L.first < R.first; });
  unsigned Idx = 0;
  int PrevDist = 0;
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
      return false;
    PrevDist = P.first;
    ++Idx;
  }

  // Calculate the shuffle indices according to their offset against the
  // sorted offsets.
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
  for (auto [I, P] : enumerate(StoreOffsetVec)) {
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  }
  // Identity order (like {0,1,2,3}) is modeled as an empty OrdersType in
  // reorderTopToBottom() and reorderBottomToTop(), so we are following the
  // same convention here.
  if (IsIdentity)
    ReorderIndices.clear();

  return true;
}
SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();

  SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);

  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  SmallVector<OrdersType, 1> ExternalReorderIndices;

  // Now inspect the stores collected per pointer and look for vectorization
  // candidates. For each candidate calculate the reorder index vector if
  // needed.
  for (ArrayRef<StoreInst *> StoresVec : Stores) {
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;

    // If the stores are not consecutive then abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;

    // We now know that the scalars in StoresVec can form a vector instruction,
    // so set the reorder indices.
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}
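
/// Collects loads that are not yet vectorized and clusters them by parent
/// block, type, and constant pointer distance, so that they can later be
/// re-tried as vectorizable groups.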
static void gatherPossiblyVectorizableLoads(
    const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
    ScalarEvolution &SE, const TargetTransformInfo &TTI,
    SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
    bool AddNew = true) {
  if (VL.empty())
    return;
  SmallVector<SmallVector<std::pair<LoadInst *, int>>> ClusteredLoads;
  SmallVector<SmallDenseMap<int, LoadInst *>> ClusteredDistToLoad;
  for (Value *V : VL) {
    auto *LI = dyn_cast<LoadInst>(V);
    if (!LI)
      continue;
    if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
      continue;
    bool IsFound = false;
    for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
      assert(LI->getParent() == Data.front().first->getParent() &&
             LI->getType() == Data.front().first->getType() &&
             "Expected loads with the same type, same parent and same "
             "underlying pointer.");
      std::optional<int> Dist = getPointersDiff(
          LI->getType(), LI->getPointerOperand(),
          Data.front().first->getType(),
          Data.front().first->getPointerOperand(), DL, SE,
          /*StrictCheck=*/true);
      if (!Dist)
        continue;
      auto It = Map.find(*Dist);
      if (It != Map.end() && It->second != LI)
        continue;
      if (It == Map.end()) {
        Data.emplace_back(LI, *Dist);
        Map.try_emplace(*Dist, LI);
      }
      IsFound = true;
      break;
    }
    if (!IsFound) {
      ClusteredLoads.emplace_back().emplace_back(LI, 0);
      ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
    }
  }
  auto FindMatchingLoads =
      [&](ArrayRef<std::pair<LoadInst *, int>> Loads,
          SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>>
              &GatheredLoads,
          SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
          int &Offset, unsigned &Start) {
        if (Loads.empty())
          return GatheredLoads.end();
        LoadInst *LI = Loads.front().first;
        for (auto [Idx, Data] : enumerate(GatheredLoads)) {
          if (Idx < Start)
            continue;
          ToAdd.clear();
          if (LI->getParent() != Data.front().first->getParent() ||
              LI->getType() != Data.front().first->getType())
            continue;
          std::optional<int> Dist =
              getPointersDiff(LI->getType(), LI->getPointerOperand(),
                              Data.front().first->getType(),
                              Data.front().first->getPointerOperand(), DL, SE,
                              /*StrictCheck=*/true);
          if (!Dist)
            continue;
          SmallSet<int, 4> DataDists;
          SmallPtrSet<LoadInst *, 4> DataLoads;
          for (std::pair<LoadInst *, int> P : Data) {
            DataDists.insert(P.second);
            DataLoads.insert(P.first);
          }
          // Found matching gathered loads - check if all loads are unique or
          // can be effectively vectorized.
          unsigned NumUniques = 0;
          for (auto [Cnt, Pair] : enumerate(Loads)) {
            bool Used = DataLoads.contains(Pair.first);
            if (!Used && !DataDists.contains(*Dist + Pair.second)) {
              ++NumUniques;
              ToAdd.insert(Cnt);
            } else if (Used) {
              Repeated.insert(Cnt);
            }
          }
          if (NumUniques > 0 &&
              (Loads.size() == NumUniques ||
               (Loads.size() - NumUniques >= 2 &&
                Loads.size() - NumUniques >= Loads.size() / 2))) {
            Offset = *Dist;
            Start = Idx + 1;
            return std::next(GatheredLoads.begin(), Idx);
          }
        }
        ToAdd.clear();
        return GatheredLoads.end();
      };
  for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
    unsigned Start = 0;
    SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
    int Offset = 0;
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                                 Offset, Start);
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
        It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
      ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
                             Start);
    }
    if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
          return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        })) {
      auto AddNewLoads =
          [&](SmallVectorImpl<std::pair<LoadInst *, int>> &Loads) {
            for (unsigned Idx : seq<unsigned>(Data.size())) {
              if (ToAdd.contains(Idx) || Repeated.contains(Idx))
                continue;
              Loads.push_back(Data[Idx]);
            }
          };
      if (!AddNew) {
        LoadInst *LI = Data.front().first;
        It = find_if(
            GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
              return PD.front().first->getParent() == LI->getParent() &&
                     PD.front().first->getType() == LI->getType();
            });
        while (It != GatheredLoads.end()) {
          AddNewLoads(*It);
          It = std::find_if(
              std::next(It), GatheredLoads.end(),
              [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
                return PD.front().first->getParent() == LI->getParent() &&
                       PD.front().first->getType() == LI->getType();
              });
        }
        continue;
      }
      GatheredLoads.emplace_back().append(Data.begin(), Data.end());
      AddNewLoads(GatheredLoads.emplace_back());
    }
  }
}
void BoUpSLP::tryToVectorizeGatheredLoads(
    const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                         SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
                         8> &GatheredLoads) {
  GatheredLoadsEntriesFirst = VectorizableTree.size();

  SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
      LoadEntriesToVectorize.size());
  for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
    Set.insert(VectorizableTree[Idx]->Scalars.begin(),
               VectorizableTree[Idx]->Scalars.end());

  // Sort loads by distance.
  auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
                       const std::pair<LoadInst *, int> &L2) {
    return L1.second > L2.second;
  };

  auto IsMaskedGatherSupported = [&](ArrayRef<LoadInst *> Loads) {
    ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
                             Loads.size());
    Align Alignment = computeCommonAlignment<LoadInst>(Values);
    auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
    return TTI->isLegalMaskedGather(Ty, Alignment) &&
           !TTI->forceScalarizeMaskedGather(Ty, Alignment);
  };

  auto GetVectorizedRanges =
      [this](ArrayRef<LoadInst *> Loads,
             SmallPtrSetImpl<LoadInst *> &VectorizedLoads,
             SmallVectorImpl<LoadInst *> &NonVectorized, bool Final,
             unsigned MaxVF) {
        SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
        unsigned StartIdx = 0;
        SmallVector<int> CandidateVFs;
        for (int NumElts = getFloorFullVectorNumberOfElements(
                 *TTI, Loads.front()->getType(), MaxVF);
             NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
                              *TTI, Loads.front()->getType(), NumElts - 1)) {
          CandidateVFs.push_back(NumElts);
        }
        if (Final && CandidateVFs.empty())
          return Results;

        unsigned BestVF = Final ? CandidateVFs.back() : 0;
        for (unsigned NumElts : CandidateVFs) {
          if (Final && NumElts > BestVF)
            continue;
          SmallVector<unsigned> MaskedGatherVectorized;
          for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
               Cnt += NumElts) {
            ArrayRef<LoadInst *> Slice =
                ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
            if (VectorizedLoads.count(Slice.front()) ||
                VectorizedLoads.count(Slice.back()) ||
                areKnownNonVectorizableLoads(Slice))
              continue;
            // Check if it is profitable to try vectorizing gathered loads.
            bool AllowToVectorize = false;
            if (NumElts == 2) {
              bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
                  Slice.front()->getType(), ElementCount::getFixed(NumElts));
              auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
                for (LoadInst *LI : Slice) {
                  // If single use/user - allow to vectorize.
                  if (LI->hasOneUse())
                    continue;
                  // Check if number of uses equals number of users.
                  if (static_cast<unsigned int>(std::distance(
                          LI->user_begin(), LI->user_end())) !=
                      LI->getNumUses())
                    return false;
                  if (!IsLegalBroadcastLoad)
                    continue;
                  // Check if all users are deleted or the load is only used
                  // as a whole operand of some gathered node.
                  for (User *U : LI->users()) {
                    if (auto *UI = dyn_cast<Instruction>(U);
                        UI && isDeleted(UI))
                      continue;
                    if (const TreeEntry *UTE = getTreeEntry(U)) {
                      for (int I : seq<int>(UTE->getNumOperands())) {
                        if (all_of(UTE->getOperand(I),
                                   [LI](Value *V) { return V == LI; }))
                          // Found legal broadcast - do not vectorize.
                          return false;
                      }
                    }
                  }
                }
                return true;
              };
              AllowToVectorize = CheckIfAllowed(Slice);
            } else {
              AllowToVectorize =
                  (NumElts >= 3 ||
                   any_of(ValueToGatherNodes.at(Slice.front()),
                          [=](const TreeEntry *TE) {
                            return TE->Scalars.size() == 2 &&
                                   ((TE->Scalars.front() == Slice.front() &&
                                     TE->Scalars.back() == Slice.back()) ||
                                    (TE->Scalars.front() == Slice.back() &&
                                     TE->Scalars.back() == Slice.front()));
                          }));
            }
            if (AllowToVectorize) {
              SmallVector<Value *> PointerOps;
              OrdersType CurrentOrder;
              // Try to build vector load.
              ArrayRef<Value *> Values(
                  reinterpret_cast<Value *const *>(Slice.begin()),
                  Slice.size());
              LoadsState LS = canVectorizeLoads(Values, Slice.front(),
                                                CurrentOrder, PointerOps,
                                                &BestVF);
              if (LS != LoadsState::Gather ||
                  (BestVF > 1 &&
                   static_cast<unsigned>(NumElts) == 2 * BestVF)) {
                if (LS == LoadsState::ScatterVectorize) {
                  if (MaskedGatherVectorized.empty() ||
                      Cnt >= MaskedGatherVectorized.back() + NumElts)
                    MaskedGatherVectorized.push_back(Cnt);
                  continue;
                }
                Results.emplace_back(Values, LS);
                VectorizedLoads.insert(Slice.begin(), Slice.end());
                // If we vectorized initial block, no need to try to vectorize
                // it again.
                if (Cnt == StartIdx)
                  StartIdx += NumElts;
              }
              if (StartIdx >= Loads.size())
                break;
              // Check if the whole array was vectorized already - exit.
              if (!MaskedGatherVectorized.empty() &&
                  Cnt < MaskedGatherVectorized.back() + NumElts)
                continue;
            }
            if (!AllowToVectorize || BestVF == 0)
              registerNonVectorizableLoads(Slice);
          }
          // Mark masked gathers candidates as vectorized, if any.
          for (unsigned Cnt : MaskedGatherVectorized) {
            ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
                Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
            ArrayRef<Value *> Values(
                reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
            Results.emplace_back(Values, LoadsState::ScatterVectorize);
            VectorizedLoads.insert(Slice.begin(), Slice.end());
            if (Cnt == StartIdx)
              StartIdx += NumElts;
          }
        }
        for (LoadInst *LI : Loads) {
          if (!VectorizedLoads.contains(LI))
            NonVectorized.push_back(LI);
        }
        return Results;
      };
  auto ProcessGatheredLoads =
      [&, &TTI = *TTI](
          ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
          bool Final = false) {
        SmallVector<LoadInst *> NonVectorized;
        for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
          if (LoadsDists.size() <= 1) {
            NonVectorized.push_back(LoadsDists.back().first);
            continue;
          }
          SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
          SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
          transform(
              LoadsDists, OriginalLoads.begin(),
              [](const std::pair<LoadInst *, int> &L) { return L.first; });
          stable_sort(LocalLoadsDists, LoadSorter);
          SmallVector<LoadInst *> Loads;
          unsigned MaxConsecutiveDistance = 0;
          unsigned CurrentConsecutiveDist = 1;
          int LastDist = LocalLoadsDists.front().second;
          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
          for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
            if (getTreeEntry(L.first))
              continue;
            assert(LastDist >= L.second &&
                   "Expected first distance always not less than second");
            if (static_cast<unsigned>(LastDist - L.second) ==
                CurrentConsecutiveDist) {
              ++CurrentConsecutiveDist;
              MaxConsecutiveDistance =
                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
              Loads.push_back(L.first);
              continue;
            }
            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
                !Loads.empty())
              Loads.pop_back();
            CurrentConsecutiveDist = 1;
            LastDist = L.second;
            Loads.push_back(L.first);
          }
          if (Loads.size() <= 1)
            continue;
          if (AllowMaskedGather)
            MaxConsecutiveDistance = Loads.size();
          else if (MaxConsecutiveDistance < 2)
            continue;
          SmallVector<LoadInst *> SortedNonVectorized;
          SmallPtrSet<LoadInst *, 4> VectorizedLoads;
          SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
                                  Final, MaxConsecutiveDistance);
          if (!Results.empty() && !SortedNonVectorized.empty() &&
              OriginalLoads.size() == Loads.size() &&
              MaxConsecutiveDistance == Loads.size() &&
              all_of(Results,
                     [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
                       return P.second == LoadsState::ScatterVectorize;
                     })) {
            VectorizedLoads.clear();
            SmallVector<LoadInst *> UnsortedNonVectorized;
            SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
                UnsortedResults =
                    GetVectorizedRanges(OriginalLoads, VectorizedLoads,
                                        UnsortedNonVectorized, Final,
                                        OriginalLoads.size());
            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
              SortedNonVectorized.swap(UnsortedNonVectorized);
              Results.swap(UnsortedResults);
            }
          }
          for (auto [Slice, _] : Results) {
            LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
                              << Slice.size() << ")\n");
            if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) {
              for (Value *L : Slice)
                if (!getTreeEntry(L))
                  SortedNonVectorized.push_back(cast<LoadInst>(L));
              continue;
            }
            // Select maximum VF as a maximum of user gathered nodes and
            // distance between scalar loads in these nodes.
            unsigned MaxVF = Slice.size();
            unsigned UserMaxVF = 0;
            unsigned InterleaveFactor = 0;
            if (MaxVF == 2) {
              UserMaxVF = MaxVF;
            } else {
              // Found distance between segments of the interleaved loads.
              std::optional<unsigned> InterleavedLoadsDistance = 0;
              unsigned Order = 0;
              std::optional<unsigned> CommonVF = 0;
              DenseMap<const TreeEntry *, unsigned> EntryToPosition;
              SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
              for (auto [Idx, V] : enumerate(Slice)) {
                for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
                  UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
                  unsigned Pos =
                      EntryToPosition.try_emplace(E, Idx).first->second;
                  UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
                  if (CommonVF) {
                    if (*CommonVF == 0) {
                      CommonVF = E->Scalars.size();
                      continue;
                    }
                    if (*CommonVF != E->Scalars.size())
                      CommonVF.reset();
                  }
                  // Check if the load is the part of the interleaved load.
                  if (Pos != Idx && InterleavedLoadsDistance) {
                    if (!DeinterleavedNodes.contains(E) &&
                        any_of(E->Scalars, [&, Slice = Slice](Value *V) {
                          if (isa<Constant>(V))
                            return false;
                          if (getTreeEntry(V))
                            return true;
                          const auto &Nodes = ValueToGatherNodes.at(V);
                          return (Nodes.size() != 1 || !Nodes.contains(E)) &&
                                 !is_contained(Slice, V);
                        })) {
                      InterleavedLoadsDistance.reset();
                      continue;
                    }
                    DeinterleavedNodes.insert(E);
                    if (*InterleavedLoadsDistance == 0) {
                      InterleavedLoadsDistance = Idx - Pos;
                      continue;
                    }
                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
                      InterleavedLoadsDistance.reset();
                    Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
                  }
                }
              }
              DeinterleavedNodes.clear();
              // Check if the large load represents interleaved load operation.
              if (InterleavedLoadsDistance.value_or(0) > 1 &&
                  CommonVF.value_or(0) != 0) {
                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
                unsigned VF = *CommonVF;
                // Segmented load detected - vectorize at maximum vector
                // factor.
                if (InterleaveFactor <= Slice.size() &&
                    TTI.isLegalInterleavedAccessType(
                        getWidenedType(Slice.front()->getType(), VF),
                        InterleaveFactor,
                        cast<LoadInst>(Slice.front())->getAlign(),
                        cast<LoadInst>(Slice.front())
                            ->getPointerAddressSpace())) {
                  UserMaxVF = InterleaveFactor * VF;
                } else {
                  InterleaveFactor = 0;
                }
              }
              // Cannot represent the loads as consecutive vectorizable nodes -
              // just exit.
              unsigned ConsecutiveNodesSize = 0;
              if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
                  any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                         [&, Slice = Slice](const auto &P) {
                           const auto *It = find_if(Slice, [&](Value *V) {
                             return std::get<1>(P).contains(V);
                           });
                           if (It == Slice.end())
                             return false;
                           ArrayRef<Value *> VL =
                               VectorizableTree[std::get<0>(P)]->Scalars;
                           ConsecutiveNodesSize += VL.size();
                           unsigned Start = std::distance(Slice.begin(), It);
                           unsigned Sz = Slice.size() - Start;
                           return Sz < VL.size() ||
                                  Slice.slice(std::distance(Slice.begin(), It),
                                              VL.size()) != VL;
                         }))
                continue;
              // Try to build long masked gather loads.
              UserMaxVF = bit_ceil(UserMaxVF);
              if (InterleaveFactor == 0 &&
                  any_of(seq<unsigned>(Slice.size() / UserMaxVF),
                         [&, Slice = Slice](unsigned Idx) {
                           OrdersType Order;
                           SmallVector<Value *> PointerOps;
                           return canVectorizeLoads(
                                      Slice.slice(Idx * UserMaxVF, UserMaxVF),
                                      Slice[Idx * UserMaxVF], Order,
                                      PointerOps) ==
                                  LoadsState::ScatterVectorize;
                         }))
                UserMaxVF = MaxVF;
              if (Slice.size() != ConsecutiveNodesSize)
                MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
            }
            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
              bool IsVectorized = true;
              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
                ArrayRef<Value *> SubSlice =
                    Slice.slice(I, std::min(VF, E - I));
                if (getTreeEntry(SubSlice.front()))
                  continue;
                // Check if the subslice is a to-be-vectorized entry, which is
                // not equal to this entry.
                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
                           [&](const auto &P) {
                             return !SubSlice.equals(
                                        VectorizableTree[std::get<0>(P)]
                                            ->Scalars) &&
                                    set_is_subset(SubSlice, std::get<1>(P));
                           }))
                  continue;
                unsigned Sz = VectorizableTree.size();
                buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
                if (Sz == VectorizableTree.size()) {
                  IsVectorized = false;
                  // Try non-interleaved vectorization with smaller vector
                  // factor.
                  if (InterleaveFactor > 0) {
                    VF = 2 * (MaxVF / InterleaveFactor);
                    InterleaveFactor = 0;
                  }
                  continue;
                }
              }
              if (IsVectorized)
                break;
            }
          }
          NonVectorized.append(SortedNonVectorized);
        }
        return NonVectorized;
      };
  for (const auto &GLs : GatheredLoads) {
    const auto &Ref = GLs.second;
    SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
    if (!Ref.empty() && !NonVectorized.empty() &&
        std::accumulate(
            Ref.begin(), Ref.end(), 0u,
            [](unsigned S, ArrayRef<std::pair<LoadInst *, int>> LoadsDists) {
              return S + LoadsDists.size();
            }) != NonVectorized.size() &&
        IsMaskedGatherSupported(NonVectorized)) {
      SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
      for (LoadInst *LI : NonVectorized) {
        // Reinsert non-vectorized loads to other list of loads with the same
        // base pointers.
        gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
                                        FinalGatheredLoads,
                                        /*AddNew=*/false);
      }
      // Final attempt to vectorize non-vectorized loads.
      (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
    }
  }
  // Try to vectorize postponed load entries, previously marked as gathered.
  for (unsigned Idx : LoadEntriesToVectorize) {
    const TreeEntry &E = *VectorizableTree[Idx];
    SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
    // Avoid reordering, if possible.
    if (!E.ReorderIndices.empty()) {
      // Build a mask out of the reorder indices and reorder scalars per this
      // mask.
      SmallVector<int> ReorderMask;
      inversePermutation(E.ReorderIndices, ReorderMask);
      reorderScalars(GatheredScalars, ReorderMask);
    }
    buildTree_rec(GatheredScalars, 0, EdgeInfo());
  }
  // If no new entries created, consider it as no gathered loads entries must
  // be handled.
  if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
      VectorizableTree.size())
    GatheredLoadsEntriesFirst.reset();
}
static Value *needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!NeedsScheduling) {
      NeedsScheduling = V;
      continue;
    }
    return nullptr;
  }
  return NeedsScheduling;
}
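
/// Generates a (key, subkey) pair used to bucket values that are likely to be
/// vectorized together: loads by pointer, instructions by opcode and operand
/// types, calls by callee.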
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI,
    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
    bool AllowAlternate) {
  hash_code Key = hash_value(V->getValueID() + 2);
  hash_code SubKey = hash_value(0);
  // Sort the loads by the distance between the pointers.
  if (auto *LI = dyn_cast<LoadInst>(V)) {
    Key = hash_combine(Key, hash_value(Instruction::Load));
    if (LI->isSimple())
      SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
    else
      Key = SubKey = hash_value(LI);
  } else if (isa<ExtractElementInst, UndefValue>(V)) {
    // Sort extracts by the vector operands.
    if (isa<ExtractElementInst, UndefValue>(V))
      Key = hash_value(Value::UndefValueVal + 1);
    if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
      if (!isUndefVector(EI->getVectorOperand()).all() &&
          !isa<UndefValue>(EI->getIndexOperand()))
        SubKey = hash_value(EI->getVectorOperand());
    }
  } else if (auto *I = dyn_cast<Instruction>(V)) {
    // Sort other instructions just by the opcodes except for CMPInst.
    // For CMP also sort by the predicate kind.
    if ((isa<BinaryOperator, CastInst>(I)) &&
        isValidForAlternation(I->getOpcode())) {
      if (AllowAlternate)
        Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
      else
        Key = hash_combine(hash_value(I->getOpcode()), Key);
      SubKey = hash_combine(
          hash_value(I->getOpcode()), hash_value(I->getType()),
          hash_value(isa<BinaryOperator>(I)
                         ? I->getType()
                         : cast<CastInst>(I)->getOperand(0)->getType()));
      // For casts, look through the only operand to improve compile time.
      if (isa<CastInst>(I)) {
        std::pair<size_t, size_t> OpVals =
            generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
                              /*AllowAlternate=*/true);
        Key = hash_combine(OpVals.first, Key);
        SubKey = hash_combine(OpVals.first, SubKey);
      }
    } else if (auto *CI = dyn_cast<CmpInst>(I)) {
      CmpInst::Predicate Pred = CI->getPredicate();
      if (CI->isCommutative())
        Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
      const CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);
      SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
                            hash_value(SwapPred),
                            hash_value(CI->getOperand(0)->getType()));
    } else if (auto *Call = dyn_cast<CallInst>(I)) {
      SubKey = hash_combine(hash_value(I->getOpcode()),
                            hash_value(Call->getCalledFunction()));
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
        SubKey = hash_value(Gep->getPointerOperand());
      else
        SubKey = hash_value(Gep);
    } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
               !isa<ConstantInt>(I->getOperand(1))) {
      // Do not try to vectorize instructions with potentially high cost.
      SubKey = hash_value(I);
    } else {
      SubKey = hash_value(I->getOpcode());
    }
    Key = hash_combine(hash_value(I->getParent()), Key);
  }
  return std::make_pair(Key, SubKey);
}
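
/// Estimates whether an alternate-opcode node is worth keeping as a single
/// shuffled vector node, by counting unique operands, undefs, and the extra
/// shuffles that would otherwise be required.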
bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       ArrayRef<Value *> VL) const {
  unsigned Opcode0 = S.getOpcode();
  unsigned Opcode1 = S.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
  // If this pattern is supported by the target then consider it profitable.
  if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
                           Opcode0, Opcode1, OpcodeMask))
    return true;
  SmallVector<ValueList> Operands;
  for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
    Operands.emplace_back();
    // Prepare the operand vector.
    for (Value *V : VL) {
      if (isa<PoisonValue>(V)) {
        Operands.back().push_back(
            PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
        continue;
      }
      Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
    }
  }
  if (Operands.size() == 2) {
    // Try to find the best pairs of operands.
    for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
      SmallVector<std::pair<Value *, Value *>> Candidates;
      Candidates.emplace_back(Operands[0][I], Operands[0][I + 1]);
      Candidates.emplace_back(Operands[0][I], Operands[1][I + 1]);
      Candidates.emplace_back(Operands[1][I], Operands[0][I + 1]);
      std::optional<int> Res = findBestRootPair(Candidates);
      switch (Res.value_or(0)) {
      case 0:
        break;
      case 1:
        std::swap(Operands[0][I + 1], Operands[1][I + 1]);
        break;
      case 2:
        std::swap(Operands[0][I], Operands[1][I]);
        break;
      default:
        llvm_unreachable("Unexpected index.");
      }
    }
  }
  DenseSet<unsigned> UniqueOpcodes;
  constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
  unsigned NonInstCnt = 0;
  // Estimate number of instructions, required for the vectorized node and for
  // the buildvector node.
  unsigned UndefCnt = 0;
  // Count the number of extra shuffles, required for vector nodes.
  unsigned ExtraShuffleInsts = 0;
  // Check that operands do not contain same values and create either the same
  // tree node or compress the operands.
  if (Operands.size() == 2) {
    // Do not count same operands twice.
    if (all_of(Operands.back(), [&](Value *V) {
          return is_contained(Operands.front(), V);
        }))
      Operands.pop_back();
    else
      ++ExtraShuffleInsts;
  }
  const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
  DenseMap<Value *, unsigned> Uniques;
  // Vectorize node, if:
  // 1. at least single operand is constant or splat.
  // 2. Operands have many loop invariants (the instructions are not loop
  //    invariants).
  // 3. At least single unique operands is supposed to be vectorized.
  for (const auto &Ops : Operands) {
    for (Value *V : Ops) {
      if (isa<Constant, ExtractElementInst>(V) ||
          getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
        if (isa<UndefValue>(V))
          ++UndefCnt;
        continue;
      }
      auto Res = Uniques.try_emplace(V, 0);
      // Found first duplicate - need to add shuffle.
      if (!Res.second && Res.first->second == 1)
        ++ExtraShuffleInsts;
      ++Res.first->getSecond();
      if (auto *I = dyn_cast<Instruction>(V))
        UniqueOpcodes.insert(I->getOpcode());
      else if (Res.second)
        ++NonInstCnt;
    }
  }
  return none_of(Uniques, [&](const auto &P) {
           return P.first->hasNUsesOrMore(P.second + 1) &&
                  none_of(P.first->users(), [&](User *U) {
                    return getTreeEntry(U) || Uniques.contains(U);
                  });
         }) ||
         // Do not vectorize node, if estimated number of vector instructions
         // is more than estimated number of buildvector instructions.
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
}
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    const InstructionsState &S, ArrayRef<Value *> VL,
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
    SmallVectorImpl<Value *> &PointerOps) {
  assert(S.getMainOp() &&
         "Expected instructions with same/alternate opcodes only.");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  Instruction *VL0 = S.getMainOp();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Check for terminator values (e.g. invoke).
    if (VL.size() > 2 && !all_of(VL, IsaPred<PHINode>))
      return TreeEntry::NeedToGather;
    for (Value *V : VL) {
      auto *PHI = dyn_cast<PHINode>(V);
      if (!PHI)
        continue;
      for (Value *Incoming : PHI->incoming_values()) {
        Instruction *Term = dyn_cast<Instruction>(Incoming);
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
    // FIXME: support non-power-of-two vectors.
    if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // Check that we have a buildvector and not a shuffle of 2 or more
    // different vectors.
    ValueSet SourceVectors;
    for (Value *V : VL) {
      SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
      assert(getElementIndex(V) != std::nullopt &&
             "Non-constant or undef index?");
    }

    if (count_if(VL, [&SourceVectors](Value *V) {
          return !SourceVectors.contains(V);
        }) >= 2) {
      // Found 2nd source vector - cancel.
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    }
    if (any_of(VL, [&SourceVectors](Value *V) {
          // The last InsertElement can have multiple uses.
          return SourceVectors.contains(V) && !V->hasOneUse();
        })) {
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load.
    switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
    case LoadsState::Vectorize:
      return TreeEntry::Vectorize;
    case LoadsState::ScatterVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return TreeEntry::ScatterVectorize;
    case LoadsState::StridedVectorize:
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return TreeEntry::StridedVectorize;
    case LoadsState::Gather:
#ifndef NDEBUG
      Type *ScalarTy = VL0->getType();
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
      else if (any_of(VL, [](Value *V) {
                 auto *LI = dyn_cast<LoadInst>(V);
                 return !LI || !LI->isSimple();
               }))
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
      else
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
#endif // NDEBUG
      registerNonVectorizableLoads(VL);
      return TreeEntry::NeedToGather;
    }
    llvm_unreachable("Unexpected state of loads");
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
      if (Ty != SrcTy || !isValidElementType(Ty)) {
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
    Type *ComparedTy = VL0->getOperand(0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(V))
        continue;
      auto *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && I->isBinaryOp() && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }

    // We can't combine several GEPs into one vector if they operate on
    // different types.
    Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(V);
      if (!GEP)
        continue;
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
      }
    }

    // We don't combine GEPs with non-constant indexes.
    Type *Ty1 = VL0->getOperand(1)->getType();
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I)
        continue;
      auto *Op = I->getOperand(1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(SI->getPointerOperand());
    }

    // Check the order of pointer operands.
    if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int> Dist =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      // Check that the sorted pointer operands are consecutive.
      if (static_cast<unsigned>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }

    LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    VFShape Shape = VFShape::get(
        CI->getFunctionType(),
        ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
        false /*HasGlobalPred*/);
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
        ScalarArgs[J] = CI->getArgOperand(J);
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
          (VecFunc &&
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments and should be same in order for
      // them to be vectorized.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
          Value *A1J = CI2->getArgOperand(J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J
                       << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // REVEC can support non alternate shuffle.
      if (SLPReVec && getShufflevectorNumGroups(VL))
        return TreeEntry::Vectorize;
      // If this is not an alternate sequence of opcode like add-sub
      // then do not vectorize this instruction.
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}
7907 PHIHandler() =
delete;
7909 : DT(DT), Main(Main), Phis(Phis),
7910 Operands(Main->getNumIncomingValues(),
7912 void buildOperands() {
7913 constexpr unsigned FastLimit = 4;
7923 auto *
P = dyn_cast<PHINode>(V);
7925 assert(isa<PoisonValue>(V) &&
7926 "Expected isa instruction or poison value.");
7930 if (
P->getIncomingBlock(
I) == InBB)
7945 Blocks.try_emplace(InBB).first->second.push_back(
I);
7948 if (isa<PoisonValue>(V)) {
7953 auto *
P = cast<PHINode>(V);
7954 for (
unsigned I : seq<unsigned>(0,
P->getNumIncomingValues())) {
7962 auto It =
Blocks.find(InBB);
7968 for (
const auto &
P :
Blocks) {
7969 if (
P.getSecond().size() <= 1)
7971 unsigned BasicI =
P.getSecond().front();
7974 [&](
const auto &Data) {
7975 return !Data.value() ||
7976 Data.value() ==
Operands[BasicI][Data.index()];
7978 "Expected empty operands list.");
7988 const EdgeInfo &UserTreeIdx,
7989 unsigned InterleaveFactor) {
7995 auto TryToFindDuplicates = [&](
const InstructionsState &S,
7996 bool DoNotFail =
false) {
7999 for (
Value *V : VL) {
8006 auto Res = UniquePositions.try_emplace(V, UniqueValues.
size());
8011 size_t NumUniqueScalarValues = UniqueValues.
size();
8014 if (NumUniqueScalarValues == VL.size() &&
8016 ReuseShuffleIndices.
clear();
8019 if ((UserTreeIdx.UserTE &&
8020 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*
TTI)) ||
8022 LLVM_DEBUG(
dbgs() <<
"SLP: Reshuffling scalars not yet supported "
8023 "for nodes with padding.\n");
8024 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
8028 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
8029 (UniquePositions.size() == 1 &&
all_of(UniqueValues, [](
Value *V) {
8032 if (DoNotFail && UniquePositions.size() > 1 &&
8033 NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
8034 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
8037 *
TTI, UniqueValues.
front()->getType(), UniqueValues.
size());
8038 if (PWSz == VL.size()) {
8039 ReuseShuffleIndices.
clear();
8041 NonUniqueValueVL.
assign(UniqueValues.
begin(), UniqueValues.
end());
8043 PWSz - UniqueValues.
size(),
8045 VL = NonUniqueValueVL;
8050 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
8063 if (S.getMainOp() &&
8064 isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
8066 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
8071 if (S.getOpcode()) {
8072 if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
8073 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *S.getMainOp()
8075 if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
8076 auto It = MultiNodeScalars.
find(S.getMainOp());
8077 if (It != MultiNodeScalars.
end()) {
8078 auto *TEIt =
find_if(It->getSecond(),
8079 [&](TreeEntry *ME) { return ME->isSame(VL); });
8080 if (TEIt != It->getSecond().end())
8090 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to partial overlap.\n");
8091 if (TryToFindDuplicates(S))
8092 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
8093 ReuseShuffleIndices);
8097 Nodes.
insert(getTreeEntry(S.getMainOp()));
8098 for (
const TreeEntry *E : MultiNodeScalars.
lookup(S.getMainOp()))
8101 if (
any_of(Nodes, [&](
const TreeEntry *E) {
8103 [&](
Value *V) { return Values.contains(V); }))
8108 all_of(VL, [&](
Value *V) {
return EValues.contains(V); }));
8111 if (TryToFindDuplicates(S))
8112 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
8113 ReuseShuffleIndices);
8120 E->UserTreeIndices.push_back(UserTreeIdx);
8121 LLVM_DEBUG(
dbgs() <<
"SLP: Perfect diamond merge at " << *S.getMainOp()
8132 !(S.getMainOp() && !S.isAltShuffle() && VL.size() >= 4 &&
8137 cast<Instruction>(
I)->getOpcode() ==
8138 S.getMainOp()->getOpcode();
8140 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to max recursion depth.\n");
8141 if (TryToFindDuplicates(S))
8142 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
8143 ReuseShuffleIndices);
8148 if (S.getOpcode() == Instruction::ExtractElement &&
8149 isa<ScalableVectorType>(
8150 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
8151 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to scalable vector type.\n");
8152 if (TryToFindDuplicates(S))
8153 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
8154 ReuseShuffleIndices);
8161 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
8170 auto &&NotProfitableForVectorization = [&S,
this,
8172 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
8181 for (
Value *V : VL) {
8182 auto *
I = cast<Instruction>(V);
8184 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
8187 bool IsCommutative =
8189 if ((IsCommutative &&
8190 std::accumulate(InstsCount.
begin(), InstsCount.
end(), 0) < 2) ||
8192 all_of(InstsCount, [](
unsigned ICnt) {
return ICnt < 2; })))
8194 assert(VL.size() == 2 &&
"Expected only 2 alternate op instructions.");
8196 auto *
I1 = cast<Instruction>(VL.front());
8197 auto *I2 = cast<Instruction>(VL.back());
8198 for (
int Op : seq<int>(S.getMainOp()->getNumOperands()))
8200 I2->getOperand(
Op));
8201 if (
static_cast<unsigned>(
count_if(
8202 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
8204 })) >= S.getMainOp()->getNumOperands() / 2)
8206 if (S.getMainOp()->getNumOperands() > 2)
8208 if (IsCommutative) {
8213 I2->getOperand((
Op + 1) % E));
8215 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
8224 bool IsScatterVectorizeUserTE =
8225 UserTreeIdx.UserTE &&
8226 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
8227 bool AreAllSameBlock = S.getOpcode() &&
allSameBlock(VL);
8228 bool AreScatterAllGEPSameBlock =
8229 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
8233 auto *
I = dyn_cast<GetElementPtrInst>(V);
8237 BB =
I->getParent();
8238 return BB ==
I->getParent() &&
I->getNumOperands() == 2;
8241 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
8243 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
8245 (isa_and_present<InsertElementInst, ExtractValueInst, ExtractElementInst>(
8248 NotProfitableForVectorization(VL)) {
8249 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to C,S,B,O, small shuffle. \n");
8250 if (TryToFindDuplicates(S))
8251 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
8252 ReuseShuffleIndices);
8257 if (S.getOpcode() && !EphValues.
empty()) {
8258 for (
Value *V : VL) {
8259 if (EphValues.
count(V)) {
8261 <<
") is ephemeral.\n");
8262 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
8272 for (
Value *V : VL) {
8273 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
8276 if (getTreeEntry(V)) {
8278 <<
") is already in tree.\n");
8279 if (TryToFindDuplicates(S))
8280 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
8281 ReuseShuffleIndices);
8287 if (UserIgnoreList && !UserIgnoreList->empty()) {
8288 for (
Value *V : VL) {
8289 if (UserIgnoreList->contains(V)) {
8290 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to gathered scalar.\n");
8291 if (TryToFindDuplicates(S))
8292 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
8293 ReuseShuffleIndices);
8301 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
8302 assert(VL.front()->getType()->isPointerTy() &&
8303 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
8304 "Expected pointers only.");
8306 const auto *It =
find_if(VL, IsaPred<GetElementPtrInst>);
8307 assert(It != VL.end() &&
"Expected at least one GEP.");
8316 if (S.getMainOp() &&
8324 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
8329 if (!TryToFindDuplicates(S,
true))
8335 TreeEntry::EntryState State = getScalarsVectorizationState(
8336 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
8337 if (State == TreeEntry::NeedToGather) {
8338 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
8339 ReuseShuffleIndices);
8343 auto &BSRef = BlocksSchedules[BB];
8345 BSRef = std::make_unique<BlockScheduling>(BB);
8347 BlockScheduling &BS = *BSRef;
8349 std::optional<ScheduleData *> Bundle =
8350 BS.tryScheduleBundle(UniqueValues,
this, S);
8351#ifdef EXPENSIVE_CHECKS
8356 LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
8357 assert((!BS.getScheduleData(VL0) ||
8358 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
8359 "tryScheduleBundle should cancelScheduling on failure");
8360 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
8361 ReuseShuffleIndices);
8362 NonScheduledFirst.insert(VL.front());
8363 if (S.getOpcode() == Instruction::Load &&
8364 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
8368 LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
8370 unsigned ShuffleOrOp = S.isAltShuffle() ?
8371 (
unsigned) Instruction::ShuffleVector : S.getOpcode();
8372 auto CreateOperandNodes = [&](TreeEntry *
TE,
const auto &
Operands) {
8375 for (
unsigned I : seq<unsigned>(
Operands.size())) {
8380 if (S.getOpcode() != Instruction::PHI || S.isAltShuffle())
8385 for (
unsigned I : PHIOps)
8388 switch (ShuffleOrOp) {
8389 case Instruction::PHI: {
8390 auto *PH = cast<PHINode>(VL0);
8393 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
8397 PHIHandler Handler(*DT, PH, VL);
8398 Handler.buildOperands();
8399 for (
unsigned I : seq<unsigned>(PH->getNumOperands()))
8400 TE->setOperand(
I, Handler.getOperands(
I));
8402 for (
unsigned I : seq<unsigned>(PH->getNumOperands()))
8407 case Instruction::ExtractValue:
8408 case Instruction::ExtractElement: {
8409 if (CurrentOrder.empty()) {
8410 LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
8413 dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence "
8415 for (
unsigned Idx : CurrentOrder)
8423 newTreeEntry(VL, Bundle , S, UserTreeIdx,
8424 ReuseShuffleIndices, CurrentOrder);
8429 VectorizableTree.back()->setOperand(0, Op0);
8432 case Instruction::InsertElement: {
8433 assert(ReuseShuffleIndices.
empty() &&
"All inserts should be unique");
8435 auto OrdCompare = [](
const std::pair<int, int> &P1,
8436 const std::pair<int, int> &P2) {
8437 return P1.first > P2.first;
8440 decltype(OrdCompare)>
8441 Indices(OrdCompare);
8442 for (
int I = 0, E = VL.size();
I < E; ++
I) {
8444 Indices.emplace(
Idx,
I);
8446 OrdersType CurrentOrder(VL.size(), VL.size());
8447 bool IsIdentity =
true;
8448 for (
int I = 0, E = VL.size();
I < E; ++
I) {
8449 CurrentOrder[Indices.top().second] =
I;
8450 IsIdentity &= Indices.top().second ==
I;
8454 CurrentOrder.clear();
8455 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8459 TE->setOperand(*
this);
8460 buildTree_rec(
TE->getOperand(1),
Depth + 1, {TE, 1});
8463 case Instruction::Load: {
8470 TreeEntry *
TE =
nullptr;
8473 case TreeEntry::Vectorize:
8474 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8475 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
8476 if (CurrentOrder.empty())
8481 case TreeEntry::StridedVectorize:
8483 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
8484 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8487 case TreeEntry::ScatterVectorize:
8489 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
8490 UserTreeIdx, ReuseShuffleIndices);
8491 LLVM_DEBUG(
dbgs() <<
"SLP: added a vector of non-consecutive loads.\n");
8493 case TreeEntry::CombinedVectorize:
8494 case TreeEntry::NeedToGather:
8497 TE->setOperand(*
this);
8498 if (State == TreeEntry::ScatterVectorize)
8499 buildTree_rec(PointerOps,
Depth + 1, {
TE, 0});
8502 case Instruction::ZExt:
8503 case Instruction::SExt:
8504 case Instruction::FPToUI:
8505 case Instruction::FPToSI:
8506 case Instruction::FPExt:
8507 case Instruction::PtrToInt:
8508 case Instruction::IntToPtr:
8509 case Instruction::SIToFP:
8510 case Instruction::UIToFP:
8511 case Instruction::Trunc:
8512 case Instruction::FPTrunc:
8513 case Instruction::BitCast: {
8514 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8515 std::make_pair(std::numeric_limits<unsigned>::min(),
8516 std::numeric_limits<unsigned>::max()));
8517 if (ShuffleOrOp == Instruction::ZExt ||
8518 ShuffleOrOp == Instruction::SExt) {
8519 CastMaxMinBWSizes = std::make_pair(
8525 }
else if (ShuffleOrOp == Instruction::Trunc) {
8526 CastMaxMinBWSizes = std::make_pair(
8533 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8534 ReuseShuffleIndices);
8537 TE->setOperand(*
this);
8539 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8540 if (ShuffleOrOp == Instruction::Trunc) {
8541 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
8542 }
else if (ShuffleOrOp == Instruction::SIToFP ||
8543 ShuffleOrOp == Instruction::UIToFP) {
8544 unsigned NumSignBits =
8546 if (
auto *OpI = dyn_cast<Instruction>(VL0->
getOperand(0))) {
8548 NumSignBits = std::max(NumSignBits,
Mask.countl_zero());
8550 if (NumSignBits * 2 >=
8552 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
8556 case Instruction::ICmp:
8557 case Instruction::FCmp: {
8560 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8561 ReuseShuffleIndices);
8565 VLOperands Ops(VL, VL0, *
this);
8570 "Commutative Predicate mismatch");
8572 Left = Ops.getVL(0);
8573 Right = Ops.getVL(1);
8576 for (
Value *V : VL) {
8577 if (isa<PoisonValue>(V)) {
8582 auto *
Cmp = cast<CmpInst>(V);
8585 if (
Cmp->getPredicate() != P0)
8587 Left.push_back(LHS);
8588 Right.push_back(RHS);
8595 if (ShuffleOrOp == Instruction::ICmp) {
8596 unsigned NumSignBits0 =
8598 if (NumSignBits0 * 2 >=
8600 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
8601 unsigned NumSignBits1 =
8603 if (NumSignBits1 * 2 >=
8605 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 1)->
Idx);
8609 case Instruction::Select:
8610 case Instruction::FNeg:
8611 case Instruction::Add:
8612 case Instruction::FAdd:
8613 case Instruction::Sub:
8614 case Instruction::FSub:
8615 case Instruction::Mul:
8616 case Instruction::FMul:
8617 case Instruction::UDiv:
8618 case Instruction::SDiv:
8619 case Instruction::FDiv:
8620 case Instruction::URem:
8621 case Instruction::SRem:
8622 case Instruction::FRem:
8623 case Instruction::Shl:
8624 case Instruction::LShr:
8625 case Instruction::AShr:
8626 case Instruction::And:
8627 case Instruction::Or:
8628 case Instruction::Xor:
8629 case Instruction::Freeze: {
8630 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8631 ReuseShuffleIndices);
8634 TE->setOperand(*
this, isa<BinaryOperator>(VL0) &&
isCommutative(VL0));
8636 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8639 case Instruction::GetElementPtr: {
8640 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8641 ReuseShuffleIndices);
8645 for (
Value *V : VL) {
8646 auto *
GEP = dyn_cast<GetElementPtrInst>(V);
8651 Operands.front().push_back(
GEP->getPointerOperand());
8662 [VL0Ty, IndexIdx](
Value *V) {
8663 auto *
GEP = dyn_cast<GetElementPtrInst>(V);
8666 return VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
8670 ->getPointerOperandType()
8673 for (
Value *V : VL) {
8674 auto *
I = dyn_cast<GetElementPtrInst>(V);
8677 ConstantInt::get(Ty, 0,
false));
8680 auto *
Op =
I->getOperand(IndexIdx);
8681 auto *CI = dyn_cast<ConstantInt>(
Op);
8686 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8690 for (
unsigned I = 0, Ops =
Operands.size();
I < Ops; ++
I)
8694 case Instruction::Store: {
8695 bool Consecutive = CurrentOrder.empty();
8698 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8699 ReuseShuffleIndices, CurrentOrder);
8700 TE->setOperand(*
this);
8701 buildTree_rec(
TE->getOperand(0),
Depth + 1, {TE, 0});
8705 LLVM_DEBUG(
dbgs() <<
"SLP: added a vector of jumbled stores.\n");
8708 case Instruction::Call: {
8711 CallInst *CI = cast<CallInst>(VL0);
8714 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8715 ReuseShuffleIndices);
8717 for (
unsigned I : seq<unsigned>(CI->
arg_size())) {
8722 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8726 case Instruction::ShuffleVector: {
8727 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8728 ReuseShuffleIndices);
8732 auto *CI = dyn_cast<CmpInst>(VL0);
8734 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
8736 auto *MainCI = cast<CmpInst>(S.getMainOp());
8737 auto *AltCI = cast<CmpInst>(S.getAltOp());
8741 "Expected different main/alternate predicates.");
8745 for (
Value *V : VL) {
8746 if (isa<PoisonValue>(V)) {
8751 auto *
Cmp = cast<CmpInst>(V);
8762 Left.push_back(LHS);
8763 Right.push_back(RHS);
8772 TE->setOperand(*
this, isa<BinaryOperator>(VL0) || CI);
8774 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8787 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
8790 if (
auto *ST = dyn_cast<StructType>(EltTy)) {
8792 for (
const auto *Ty : ST->elements())
8793 if (Ty != *ST->element_begin())
8795 N *= ST->getNumElements();
8796 EltTy = *ST->element_begin();
8797 }
else if (
auto *AT = dyn_cast<ArrayType>(EltTy)) {
8798 N *= AT->getNumElements();
8799 EltTy = AT->getElementType();
8801 auto *VT = cast<FixedVectorType>(EltTy);
8802 N *= VT->getNumElements();
8803 EltTy = VT->getElementType();
8810 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
8818 bool ResizeAllowed)
const {
8819 const auto *It =
find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
8820 assert(It != VL.
end() &&
"Expected at least one extract instruction.");
8821 auto *E0 = cast<Instruction>(*It);
8823 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
8827 Value *Vec = E0->getOperand(0);
8829 CurrentOrder.
clear();
8833 if (E0->getOpcode() == Instruction::ExtractValue) {
8838 LoadInst *LI = dyn_cast<LoadInst>(Vec);
8842 NElts = cast<FixedVectorType>(Vec->
getType())->getNumElements();
8845 unsigned E = VL.
size();
8846 if (!ResizeAllowed && NElts != E)
8849 unsigned MinIdx = NElts, MaxIdx = 0;
8851 auto *Inst = dyn_cast<Instruction>(V);
8854 if (Inst->getOperand(0) != Vec)
8856 if (
auto *EE = dyn_cast<ExtractElementInst>(Inst))
8857 if (isa<UndefValue>(EE->getIndexOperand()))
8862 const unsigned ExtIdx = *
Idx;
8863 if (ExtIdx >= NElts)
8865 Indices[
I] = ExtIdx;
8866 if (MinIdx > ExtIdx)
8868 if (MaxIdx < ExtIdx)
8871 if (MaxIdx - MinIdx + 1 > E)
8873 if (MaxIdx + 1 <= E)
8877 bool ShouldKeepOrder =
true;
8883 CurrentOrder.
assign(E, E);
8884 for (
unsigned I = 0;
I < E; ++
I) {
8887 const unsigned ExtIdx = Indices[
I] - MinIdx;
8888 if (CurrentOrder[ExtIdx] != E) {
8889 CurrentOrder.
clear();
8892 ShouldKeepOrder &= ExtIdx ==
I;
8893 CurrentOrder[ExtIdx] =
I;
8895 if (ShouldKeepOrder)
8896 CurrentOrder.
clear();
8898 return ShouldKeepOrder;
8901bool BoUpSLP::areAllUsersVectorized(
8903 return (
I->hasOneUse() && (!VectorizedVals || VectorizedVals->
contains(
I))) ||
8905 return ScalarToTreeEntry.contains(U) ||
8906 isVectorLikeInstWithConstOps(U) ||
8907 (isa<ExtractElementInst>(U) && MustGather.contains(U));
8911static std::pair<InstructionCost, InstructionCost>
8919 if (
auto *FPCI = dyn_cast<FPMathOperator>(CI))
8920 FMF = FPCI->getFastMathFlags();
8923 dyn_cast<IntrinsicInst>(CI));
8924 auto IntrinsicCost =
8931 auto LibCost = IntrinsicCost;
8938 return {IntrinsicCost, LibCost};
8941void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
8945 unsigned Sz = Scalars.size();
8948 if (!ReorderIndices.empty())
8950 for (
unsigned I = 0;
I < Sz; ++
I) {
8952 if (!ReorderIndices.empty())
8954 if (isa<PoisonValue>(Scalars[
Idx]))
8956 auto *OpInst = cast<Instruction>(Scalars[
Idx]);
8957 if (IsAltOp(OpInst)) {
8967 if (!ReuseShuffleIndices.
empty()) {
8970 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
8980 if (
auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
8981 auto *AltCI = cast<CmpInst>(AltOp);
8984 assert(MainP != AltP &&
"Expected different main/alternate predicates.");
8985 auto *CI = cast<CmpInst>(
I);
8993 assert((MainP ==
P || AltP ==
P || MainP == SwappedP || AltP == SwappedP) &&
8994 "CmpInst expected to match either main or alternate predicate or "
8997 return MainP !=
P && MainP != SwappedP;
9004 const auto *Op0 = Ops.
front();
9010 const bool IsUniform =
all_of(Ops, [=](
Value *V) {
9014 const bool IsPowerOfTwo =
all_of(Ops, [](
Value *V) {
9016 if (
auto *CI = dyn_cast<ConstantInt>(V))
9017 return CI->getValue().isPowerOf2();
9020 const bool IsNegatedPowerOfTwo =
all_of(Ops, [](
Value *V) {
9022 if (
auto *CI = dyn_cast<ConstantInt>(V))
9023 return CI->getValue().isNegatedPowerOf2();
9028 if (IsConstant && IsUniform)
9030 else if (IsConstant)
9044class BaseShuffleAnalysis {
9046 Type *ScalarTy =
nullptr;
9048 BaseShuffleAnalysis(
Type *ScalarTy) : ScalarTy(ScalarTy) {}
9056 unsigned getVF(
Value *V)
const {
9057 assert(V &&
"V cannot be nullptr");
9058 assert(isa<FixedVectorType>(
V->getType()) &&
9059 "V does not have FixedVectorType");
9060 assert(ScalarTy &&
"ScalarTy cannot be nullptr");
9062 unsigned VNumElements =
9063 cast<FixedVectorType>(
V->getType())->getNumElements();
9064 assert(VNumElements > ScalarTyNumElements &&
9065 "the number of elements of V is not large enough");
9066 assert(VNumElements % ScalarTyNumElements == 0 &&
9067 "the number of elements of V is not a vectorized value");
9068 return VNumElements / ScalarTyNumElements;
9076 int Limit =
Mask.size();
9088 if (Limit % VF == 0 &&
all_of(seq<int>(0, Limit / VF), [=](
int Idx) {
9104 unsigned VF =
Mask.size();
9106 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
9109 int MaskedIdx =
Mask[ExtMask[
I] % VF];
9150 bool SinglePermute) {
9154 while (
auto *SV = dyn_cast<ShuffleVectorInst>(
Op)) {
9156 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
9162 if (isIdentityMask(Mask, SVTy,
false)) {
9163 if (!IdentityOp || !SinglePermute ||
9164 (isIdentityMask(Mask, SVTy,
true) &&
9166 IdentityMask.
size()))) {
9171 IdentityMask.
assign(Mask);
9191 if (SV->isZeroEltSplat()) {
9193 IdentityMask.
assign(Mask);
9195 int LocalVF =
Mask.size();
9197 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
9198 LocalVF = SVOpTy->getNumElements();
9202 static_cast<unsigned>(
I) >= SV->getShuffleMask().size())
9204 ExtMask[
Idx] = SV->getMaskValue(
I);
9214 if (!IsOp1Undef && !IsOp2Undef) {
9216 for (
int &
I : Mask) {
9219 if (SV->getMaskValue(
I % SV->getShuffleMask().size()) ==
9226 combineMasks(LocalVF, ShuffleMask, Mask);
9227 Mask.swap(ShuffleMask);
9229 Op = SV->getOperand(0);
9231 Op = SV->getOperand(1);
9233 if (
auto *OpTy = dyn_cast<FixedVectorType>(
Op->getType());
9234 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
9239 "Expected masks of same sizes.");
9244 Mask.swap(IdentityMask);
9245 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
9246 return SinglePermute &&
9247 (isIdentityMask(Mask, cast<FixedVectorType>(
V->getType()),
9249 (Shuffle &&
Mask.size() == Shuffle->getShuffleMask().size() &&
9250 Shuffle->isZeroEltSplat() &&
9263 template <
typename T,
typename ShuffleBuilderTy>
9265 ShuffleBuilderTy &Builder) {
9266 assert(V1 &&
"Expected at least one vector value.");
9268 Builder.resizeToMatch(V1, V2);
9269 int VF =
Mask.size();
9270 if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
9271 VF = FTy->getNumElements();
9272 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
9279 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
9282 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
9284 CombinedMask1[
I] =
Mask[
I];
9286 CombinedMask2[
I] =
Mask[
I] - VF;
9293 (void)peekThroughShuffles(Op1, CombinedMask1,
false);
9294 (void)peekThroughShuffles(Op2, CombinedMask2,
false);
9297 if (
auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
9298 if (
auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
9303 ExtMask1[
Idx] = SV1->getMaskValue(
I);
9306 cast<FixedVectorType>(SV1->getOperand(1)->getType())
9308 ExtMask1, UseMask::SecondArg);
9313 ExtMask2[
Idx] = SV2->getMaskValue(
I);
9316 cast<FixedVectorType>(SV2->getOperand(1)->getType())
9318 ExtMask2, UseMask::SecondArg);
9319 if (SV1->getOperand(0)->getType() ==
9320 SV2->getOperand(0)->getType() &&
9321 SV1->getOperand(0)->getType() != SV1->getType() &&
9324 Op1 = SV1->getOperand(0);
9325 Op2 = SV2->getOperand(0);
9327 int LocalVF = ShuffleMask1.size();
9328 if (
auto *FTy = dyn_cast<FixedVectorType>(Op1->
getType()))
9329 LocalVF = FTy->getNumElements();
9330 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
9331 CombinedMask1.swap(ShuffleMask1);
9333 LocalVF = ShuffleMask2.size();
9334 if (
auto *FTy = dyn_cast<FixedVectorType>(Op2->
getType()))
9335 LocalVF = FTy->getNumElements();
9336 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
9337 CombinedMask2.swap(ShuffleMask2);
9340 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
9341 Builder.resizeToMatch(Op1, Op2);
9342 VF = std::max(cast<VectorType>(Op1->
getType())
9344 .getKnownMinValue(),
9345 cast<VectorType>(Op2->
getType())
9347 .getKnownMinValue());
9348 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
9351 "Expected undefined mask element");
9352 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
9358 isa<ShuffleVectorInst>(Op1) &&
9359 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
9361 return Builder.createIdentity(Op1);
9362 return Builder.createShuffleVector(
9366 if (isa<PoisonValue>(V1))
9367 return Builder.createPoison(
9368 cast<VectorType>(V1->
getType())->getElementType(),
Mask.size());
9370 bool IsIdentity = peekThroughShuffles(V1, NewMask,
true);
9371 assert(V1 &&
"Expected non-null value after looking through shuffles.");
9374 return Builder.createShuffleVector(V1, NewMask);
9375 return Builder.createIdentity(V1);
9381static std::pair<InstructionCost, InstructionCost>
9392 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
9402 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
9406 for (
Value *V : Ptrs) {
9411 auto *
Ptr = dyn_cast<GetElementPtrInst>(V);
9416 if (!
Ptr || !
Ptr->hasOneUse())
9420 if (PtrsRetainedInVecCode.
size() == Ptrs.size()) {
9426 TTI::PointersChainInfo::getKnownStride(),
9436 [](
const Value *V) {
9437 auto *
Ptr = dyn_cast<GetElementPtrInst>(V);
9438 return Ptr && !
Ptr->hasAllConstantIndices();
9440 ? TTI::PointersChainInfo::getUnknownStride()
9441 : TTI::PointersChainInfo::getKnownStride();
9445 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
9447 auto *It =
find_if(Ptrs, IsaPred<GEPOperator>);
9448 if (It != Ptrs.
end())
9449 BaseGEP = cast<GEPOperator>(*It);
9454 BaseGEP->getPointerOperand(), Indices, VecTy,
9459 return std::make_pair(ScalarCost, VecCost);
9462void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
9463 assert(
TE.isGather() &&
TE.ReorderIndices.empty() &&
9464 "Expected gather node without reordering.");
9470 if (
TE.Scalars.size() == 2 || (
TE.getOpcode() && !
TE.isAltShuffle()) ||
9474 if (
any_of(seq<unsigned>(
TE.Idx), [&](
unsigned Idx) {
9475 return VectorizableTree[Idx]->isSame(TE.Scalars);
9479 auto GenerateLoadsSubkey = [&](
size_t Key,
LoadInst *LI) {
9484 auto LIt = LoadsMap.
find(std::make_pair(Key,
Ptr));
9485 if (LIt != LoadsMap.
end()) {
9486 for (
LoadInst *RLI : LIt->second) {
9492 for (
LoadInst *RLI : LIt->second) {
9499 if (LIt->second.size() > 2) {
9501 hash_value(LIt->second.back()->getPointerOperand());
9507 LoadsMap.
try_emplace(std::make_pair(Key,
Ptr)).first->second.push_back(LI);
9512 bool IsOrdered =
true;
9513 unsigned NumInstructions = 0;
9518 if (
auto *Inst = dyn_cast<Instruction>(V);
9519 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
9525 auto &Container = SortedValues[
Key];
9526 if (IsOrdered && !KeyToIndex.
contains(V) &&
9527 !(isa<Constant, ExtractElementInst>(V) ||
9529 ((Container.contains(
Idx) &&
9530 KeyToIndex.
at(Container[
Idx].back()).back() !=
I - 1) ||
9531 (!Container.empty() && !Container.contains(
Idx) &&
9532 KeyToIndex.
at(Container.back().second.back()).back() !=
I - 1)))
9534 auto &KTI = KeyToIndex[
V];
9536 Container[
Idx].push_back(V);
9541 if (!IsOrdered && NumInstructions > 1) {
9543 TE.ReorderIndices.resize(
TE.Scalars.size(),
TE.Scalars.size());
9544 for (
const auto &
D : SortedValues) {
9545 for (
const auto &
P :
D.second) {
9547 for (
Value *V :
P.second) {
9550 TE.ReorderIndices[Cnt +
K] =
Idx;
9551 TE.Scalars[Cnt +
K] =
V;
9553 Sz += Indices.
size();
9554 Cnt += Indices.
size();
9556 if (Sz > 1 && isa<Instruction>(
P.second.front())) {
9558 *
TTI,
TE.Scalars.front()->getType(), Sz);
9560 for (
unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
9562 }
else if (!
P.second.empty() &&
isConstant(
P.second.front())) {
9563 for (
unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
9570 if (!
TE.ReuseShuffleIndices.empty() ||
TE.ReorderIndices.empty())
9575 auto *ScalarTy =
TE.Scalars.front()->getType();
9577 for (
auto [
Idx, Sz] : SubVectors) {
9581 if (
auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9586 for (
unsigned I : seq<unsigned>(
TE.Scalars.size()))
9587 if (DemandedElts[
I])
9590 CostKind,
I * ScalarTyNumElements, FTy);
9595 int Sz =
TE.Scalars.size();
9597 TE.ReorderIndices.end());
9598 for (
unsigned I : seq<unsigned>(Sz)) {
9600 if (isa<PoisonValue>(V)) {
9603 ReorderMask[
I] =
I +
TE.ReorderIndices.size();
9607 any_of(ReorderMask, [&](
int I) {
return I >= Sz; })
9610 VecTy, ReorderMask);
9613 for (
unsigned I : seq<unsigned>(Sz)) {
9617 if (!isa<PoisonValue>(V))
9620 ReorderMask[
I] =
I + Sz;
9624 VecTy, DemandedElts,
true,
false,
CostKind);
9627 if (
Cost >= BVCost) {
9630 TE.ReorderIndices.clear();
9636 BaseGraphSize = VectorizableTree.size();
9638 class GraphTransformModeRAAI {
9639 bool &SavedIsGraphTransformMode;
9642 GraphTransformModeRAAI(
bool &IsGraphTransformMode)
9643 : SavedIsGraphTransformMode(IsGraphTransformMode) {
9644 IsGraphTransformMode =
true;
9646 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode =
false; }
9647 } TransformContext(IsGraphTransformMode);
9656 const InstructionsState &S) {
9658 for (
unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
9660 I2->getOperand(
Op));
9662 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
9664 [](
const std::pair<Value *, Value *> &
P) {
9665 return isa<Constant>(
P.first) ||
9666 isa<Constant>(
P.second) ||
P.first ==
P.second;
9673 for (
unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9674 TreeEntry &E = *VectorizableTree[
Idx];
9676 reorderGatherNode(E);
9680 for (
unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9681 TreeEntry &E = *VectorizableTree[
Idx];
9688 if (VL.
size() <= 2 || LoadEntriesToVectorize.contains(
Idx) ||
9689 !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
9695 unsigned StartIdx = 0;
9700 *
TTI, VL.
front()->getType(), VF - 1)) {
9701 if (StartIdx + VF >
End)
9704 for (
unsigned Cnt = StartIdx; Cnt + VF <=
End; Cnt += VF) {
9708 if (
const TreeEntry *SE = getTreeEntry(Slice.
front());
9709 SE || getTreeEntry(Slice.
back())) {
9712 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9720 bool IsSplat =
isSplat(Slice);
9721 if (Slices.
empty() || !IsSplat ||
9723 Slice.
front()->getType(), VF)),
9726 Slice.
front()->getType(), 2 * VF)),
9729 static_cast<long>(isa<UndefValue>(Slice.
front()) ? VF - 1
9734 if (!S.getOpcode() || S.isAltShuffle() || !
allSameBlock(Slice) ||
9735 (S.getOpcode() == Instruction::Load &&
9742 if ((!UserIgnoreList || E.Idx != 0) &&
9746 if (isa<PoisonValue>(V))
9748 return areAllUsersVectorized(cast<Instruction>(V),
9752 if (S.getOpcode() == Instruction::Load) {
9764 if (UserIgnoreList && E.Idx == 0)
9769 }
else if (S.getOpcode() == Instruction::ExtractElement ||
9772 !CheckOperandsProfitability(
9775 IsaPred<Instruction>)),
9786 auto AddCombinedNode = [&](
unsigned Idx,
unsigned Cnt,
unsigned Sz) {
9787 E.CombinedEntriesWithIndices.emplace_back(
Idx, Cnt);
9788 if (StartIdx == Cnt)
9789 StartIdx = Cnt + Sz;
9790 if (
End == Cnt + Sz)
9793 for (
auto [Cnt, Sz] : Slices) {
9796 if (TreeEntry *SE = getTreeEntry(Slice.
front());
9797 SE || getTreeEntry(Slice.
back())) {
9800 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9802 SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
9803 AddCombinedNode(SE->Idx, Cnt, Sz);
9806 unsigned PrevSize = VectorizableTree.size();
9807 [[maybe_unused]]
unsigned PrevEntriesSize =
9808 LoadEntriesToVectorize.size();
9809 buildTree_rec(Slice, 0,
EdgeInfo(&E, UINT_MAX));
9810 if (PrevSize + 1 == VectorizableTree.size() &&
9811 VectorizableTree[PrevSize]->isGather() &&
9812 VectorizableTree[PrevSize]->getOpcode() !=
9813 Instruction::ExtractElement &&
9815 if (UserIgnoreList && E.Idx == 0 && VF == 2)
9817 VectorizableTree.pop_back();
9818 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
9819 "LoadEntriesToVectorize expected to remain the same");
9822 AddCombinedNode(PrevSize, Cnt, Sz);
9826 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
9829 E.ReorderIndices.clear();
9832 switch (E.getOpcode()) {
9833 case Instruction::Load: {
9836 if (E.State != TreeEntry::Vectorize)
9838 Type *ScalarTy = E.getMainOp()->getType();
9840 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
9843 if (!E.ReorderIndices.empty() &&
isReverseOrder(E.ReorderIndices) &&
9847 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
9854 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
9855 false, CommonAlignment,
CostKind, BaseLI);
9856 if (StridedCost < OriginalVecCost)
9859 E.State = TreeEntry::StridedVectorize;
9863 case Instruction::Store: {
9865 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
9867 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
9870 if (!E.ReorderIndices.empty() &&
isReverseOrder(E.ReorderIndices) &&
9874 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
9881 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9882 false, CommonAlignment,
CostKind, BaseSI);
9883 if (StridedCost < OriginalVecCost)
9886 E.State = TreeEntry::StridedVectorize;
9887 }
else if (!E.ReorderIndices.empty()) {
9890 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
9891 assert(Mask.size() > 1 &&
"Expected mask greater than 1 element.");
9892 if (Mask.size() < 4)
9894 for (
unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
9898 VecTy, Factor, BaseSI->getAlign(),
9906 unsigned InterleaveFactor = IsInterleaveMask(Mask);
9907 if (InterleaveFactor != 0)
9908 E.setInterleave(InterleaveFactor);
9912 case Instruction::Select: {
9913 if (E.State != TreeEntry::Vectorize)
9919 E.CombinedOp = TreeEntry::MinMax;
9920 TreeEntry *CondEntry =
const_cast<TreeEntry *
>(getOperandEntry(&E, 0));
9921 if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
9922 CondEntry->State == TreeEntry::Vectorize) {
9924 CondEntry->State = TreeEntry::CombinedVectorize;
9933 if (LoadEntriesToVectorize.empty()) {
9935 if (VectorizableTree.size() <= 1 &&
9936 VectorizableTree.front()->getOpcode() == Instruction::Load)
9939 constexpr unsigned SmallTree = 3;
9940 constexpr unsigned SmallVF = 2;
9941 if ((VectorizableTree.size() <= SmallTree &&
9942 VectorizableTree.front()->Scalars.size() == SmallVF) ||
9943 (VectorizableTree.size() <= 2 && UserIgnoreList))
9946 if (VectorizableTree.front()->isNonPowOf2Vec() &&
9950 [](
const std::unique_ptr<TreeEntry> &TE) {
9951 return TE->isGather() &&
9952 TE->getOpcode() == Instruction::Load &&
9964 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9967 (E.getOpcode() == Instruction::Load ||
9968 (!E.getOpcode() &&
any_of(E.Scalars,
9970 return isa<LoadInst>(V) &&
9972 !isDeleted(cast<Instruction>(V));
9975 for (
Value *V : E.Scalars) {
9976 auto *LI = dyn_cast<LoadInst>(V);
9982 *
this, V, *DL, *SE, *
TTI,
9983 GatheredLoads[std::make_tuple(
9991 if (!GatheredLoads.
empty())
9992 tryToVectorizeGatheredLoads(GatheredLoads);
10002 bool IsFinalized =
false;
10015 bool SameNodesEstimated =
true;
10024 if (
auto *VTy = dyn_cast<VectorType>(Ty))
10040 const auto *It =
find_if_not(VL, IsaPred<UndefValue>);
10041 assert(It != VL.
end() &&
"Expected at least one non-undef value.");
10044 count(VL, *It) > 1 &&
10046 if (!NeedShuffle) {
10047 if (isa<FixedVectorType>(ScalarTy)) {
10052 cast<FixedVectorType>(ScalarTy));
10055 CostKind, std::distance(VL.
begin(), It),
10061 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
10068 VecTy, ShuffleMask, CostKind,
10072 return GatherCost +
10073 (
all_of(Gathers, IsaPred<UndefValue>)
10075 : R.getGatherCost(Gathers, !Root && VL.
equals(Gathers),
10083 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10084 unsigned NumParts) {
10085 assert(VL.
size() > NumParts &&
"Unexpected scalarized shuffle.");
10087 std::accumulate(VL.
begin(), VL.
end(), 0, [](
unsigned Sz,
Value *V) {
10088 auto *EE = dyn_cast<ExtractElementInst>(V);
10091 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
10094 return std::max(Sz, VecTy->getNumElements());
10100 -> std::optional<TTI::ShuffleKind> {
10101 if (NumElts <= EltsPerVector)
10102 return std::nullopt;
10104 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
10106 if (I == PoisonMaskElem)
10108 return std::min(S, I);
10111 int OffsetReg1 = OffsetReg0;
10115 int FirstRegId = -1;
10116 Indices.assign(1, OffsetReg0);
10120 int Idx =
I - OffsetReg0;
10122 (
Idx / NumElts) * NumParts + (
Idx % NumElts) / EltsPerVector;
10123 if (FirstRegId < 0)
10124 FirstRegId = RegId;
10125 RegIndices.
insert(RegId);
10126 if (RegIndices.
size() > 2)
10127 return std::nullopt;
10128 if (RegIndices.
size() == 2) {
10130 if (Indices.
size() == 1) {
10133 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
10134 [&](
int S,
int I) {
10135 if (I == PoisonMaskElem)
10137 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
10138 ((I - OffsetReg0) % NumElts) / EltsPerVector;
10139 if (RegId == FirstRegId)
10141 return std::min(S, I);
10144 Indices.push_back(OffsetReg1 % NumElts);
10146 Idx =
I - OffsetReg1;
10148 I = (
Idx % NumElts) % EltsPerVector +
10149 (RegId == FirstRegId ? 0 : EltsPerVector);
10151 return ShuffleKind;
10158 for (
unsigned Part : seq<unsigned>(NumParts)) {
10159 if (!ShuffleKinds[Part])
10162 Part * EltsPerVector,
getNumElems(Mask.size(), EltsPerVector, Part));
10166 std::optional<TTI::ShuffleKind> RegShuffleKind =
10167 CheckPerRegistersShuffle(SubMask, Indices);
10168 if (!RegShuffleKind) {
10171 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.
size())))
10184 *R.TTI, VL.
front()->getType(),
alignTo(NumElts, EltsPerVector));
10185 for (
unsigned Idx : Indices) {
10186 assert((
Idx + EltsPerVector) <= BaseVF &&
10187 "SK_ExtractSubvector index out of range");
10198 if (OriginalCost <
Cost)
10199 Cost = OriginalCost;
10207 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10214 void estimateNodesPermuteCost(
const TreeEntry &E1,
const TreeEntry *E2,
10216 unsigned SliceSize) {
10217 if (SameNodesEstimated) {
10223 if ((InVectors.
size() == 2 &&
10224 cast<const TreeEntry *>(InVectors.
front()) == &E1 &&
10225 cast<const TreeEntry *>(InVectors.
back()) == E2) ||
10226 (!E2 && cast<const TreeEntry *>(InVectors.
front()) == &E1)) {
10227 unsigned Limit =
getNumElems(Mask.size(), SliceSize, Part);
10230 "Expected all poisoned elements.");
10232 copy(SubMask, std::next(CommonMask.
begin(), SliceSize * Part));
10237 Cost += createShuffle(InVectors.
front(),
10238 InVectors.
size() == 1 ?
nullptr : InVectors.
back(),
10240 transformMaskAfterShuffle(CommonMask, CommonMask);
10241 }
else if (InVectors.
size() == 2) {
10242 Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
10243 transformMaskAfterShuffle(CommonMask, CommonMask);
10245 SameNodesEstimated =
false;
10246 if (!E2 && InVectors.
size() == 1) {
10247 unsigned VF = E1.getVectorFactor();
10250 cast<FixedVectorType>(V1->
getType())->getNumElements());
10252 const auto *E = cast<const TreeEntry *>(InVectors.
front());
10253 VF = std::max(VF, E->getVectorFactor());
10255 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10257 CommonMask[
Idx] = Mask[
Idx] + VF;
10258 Cost += createShuffle(InVectors.
front(), &E1, CommonMask);
10259 transformMaskAfterShuffle(CommonMask, CommonMask);
10261 auto P = InVectors.
front();
10262 Cost += createShuffle(&E1, E2, Mask);
10263 unsigned VF = Mask.size();
10268 const auto *E = cast<const TreeEntry *>(
P);
10269 VF = std::max(VF, E->getVectorFactor());
10271 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10273 CommonMask[
Idx] =
Idx + (InVectors.
empty() ? 0 : VF);
10274 Cost += createShuffle(
P, InVectors.
front(), CommonMask);
10275 transformMaskAfterShuffle(CommonMask, CommonMask);
10279 class ShuffleCostBuilder {
10282 static bool isEmptyOrIdentity(
ArrayRef<int> Mask,
unsigned VF) {
10284 return Mask.empty() ||
10285 (VF == Mask.size() &&
10293 ~ShuffleCostBuilder() =
default;
10298 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
10299 if (isEmptyOrIdentity(Mask, VF))
10302 cast<VectorType>(V1->
getType()), Mask);
10307 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
10308 if (isEmptyOrIdentity(Mask, VF))
10311 cast<VectorType>(V1->
getType()), Mask);
10317 void resizeToMatch(
Value *&,
Value *&)
const {}
10327 ShuffleCostBuilder Builder(
TTI);
10330 unsigned CommonVF = Mask.size();
10332 auto GetNodeMinBWAffectedCost = [&](
const TreeEntry &E,
10336 Type *EScalarTy = E.Scalars.front()->getType();
10337 bool IsSigned =
true;
10338 if (
auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
10340 IsSigned = It->second.second;
10342 if (EScalarTy != ScalarTy) {
10343 unsigned CastOpcode = Instruction::Trunc;
10344 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10345 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10347 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10355 if (isa<Constant>(V))
10357 auto *VecTy = cast<VectorType>(V->getType());
10359 if (EScalarTy != ScalarTy) {
10361 unsigned CastOpcode = Instruction::Trunc;
10362 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10363 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10365 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10372 if (!V1 && !V2 && !P2.
isNull()) {
10374 const TreeEntry *E = cast<const TreeEntry *>(P1);
10375 unsigned VF = E->getVectorFactor();
10376 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10377 CommonVF = std::max(VF, E2->getVectorFactor());
10380 return Idx < 2 * static_cast<int>(CommonVF);
10382 "All elements in mask must be less than 2 * CommonVF.");
10383 if (E->Scalars.size() == E2->Scalars.size()) {
10387 for (
int &
Idx : CommonMask) {
10390 if (
Idx <
static_cast<int>(CommonVF) && !EMask.
empty())
10392 else if (
Idx >=
static_cast<int>(CommonVF))
10393 Idx = (E2Mask.
empty() ?
Idx - CommonVF : E2Mask[
Idx - CommonVF]) +
10397 CommonVF = E->Scalars.size();
10398 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
10399 GetNodeMinBWAffectedCost(*E2, CommonVF);
10401 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
10402 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
10405 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
10406 }
else if (!V1 && P2.
isNull()) {
10408 const TreeEntry *E = cast<const TreeEntry *>(P1);
10409 unsigned VF = E->getVectorFactor();
10413 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
10414 "All elements in mask must be less than CommonVF.");
10415 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
10417 assert(!EMask.
empty() &&
"Expected non-empty common mask.");
10418 for (
int &
Idx : CommonMask) {
10422 CommonVF = E->Scalars.size();
10423 }
else if (
unsigned Factor = E->getInterleaveFactor();
10424 Factor > 0 && E->Scalars.size() != Mask.size() &&
10428 std::iota(CommonMask.
begin(), CommonMask.
end(), 0);
10430 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
10433 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
10434 CommonVF == CommonMask.
size() &&
10436 [](
const auto &&
P) {
10438 static_cast<unsigned>(
P.value()) !=
P.index();
10446 }
else if (V1 && P2.
isNull()) {
10448 ExtraCost += GetValueMinBWAffectedCost(V1);
10449 CommonVF = getVF(V1);
10452 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
10453 "All elements in mask must be less than CommonVF.");
10454 }
else if (V1 && !V2) {
10456 unsigned VF = getVF(V1);
10457 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10458 CommonVF = std::max(VF, E2->getVectorFactor());
10461 return Idx < 2 * static_cast<int>(CommonVF);
10463 "All elements in mask must be less than 2 * CommonVF.");
10464 if (E2->Scalars.size() == VF && VF != CommonVF) {
10466 assert(!E2Mask.
empty() &&
"Expected non-empty common mask.");
10467 for (
int &
Idx : CommonMask) {
10470 if (
Idx >=
static_cast<int>(CommonVF))
10471 Idx = E2Mask[
Idx - CommonVF] + VF;
10475 ExtraCost += GetValueMinBWAffectedCost(V1);
10477 ExtraCost += GetNodeMinBWAffectedCost(
10478 *E2, std::min(CommonVF, E2->getVectorFactor()));
10479 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
10480 }
else if (!V1 && V2) {
10482 unsigned VF = getVF(V2);
10483 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
10484 CommonVF = std::max(VF, E1->getVectorFactor());
10487 return Idx < 2 * static_cast<int>(CommonVF);
10489 "All elements in mask must be less than 2 * CommonVF.");
10490 if (E1->Scalars.size() == VF && VF != CommonVF) {
10492 assert(!E1Mask.
empty() &&
"Expected non-empty common mask.");
10493 for (
int &
Idx : CommonMask) {
10496 if (
Idx >=
static_cast<int>(CommonVF))
10497 Idx = E1Mask[
Idx - CommonVF] + VF;
10503 ExtraCost += GetNodeMinBWAffectedCost(
10504 *E1, std::min(CommonVF, E1->getVectorFactor()));
10506 ExtraCost += GetValueMinBWAffectedCost(V2);
10507 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
10509 assert(V1 && V2 &&
"Expected both vectors.");
10510 unsigned VF = getVF(V1);
10511 CommonVF = std::max(VF, getVF(V2));
10514 return Idx < 2 * static_cast<int>(CommonVF);
10516 "All elements in mask must be less than 2 * CommonVF.");
10518 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
10519 if (V1->
getType() != V2->getType()) {
10521 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
10523 if (cast<VectorType>(V1->
getType())->getElementType() != ScalarTy)
10525 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
10526 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
10529 if (
auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
10534 InVectors.
front() =
10536 if (InVectors.
size() == 2)
10538 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
10539 V1, V2, CommonMask, Builder);
10546 : BaseShuffleAnalysis(ScalarTy),
TTI(
TTI),
10547 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
10548 CheckedExtracts(CheckedExtracts) {}
10550 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10551 unsigned NumParts,
bool &UseVecBaseAsInput) {
10552 UseVecBaseAsInput =
false;
10555 Value *VecBase =
nullptr;
10557 if (!E->ReorderIndices.empty()) {
10559 E->ReorderIndices.end());
10564 bool PrevNodeFound =
any_of(
10566 [&](
const std::unique_ptr<TreeEntry> &TE) {
10567 return ((!TE->isAltShuffle() &&
10568 TE->getOpcode() == Instruction::ExtractElement) ||
10570 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10571 return VL.size() > Data.index() &&
10572 (Mask[Data.index()] == PoisonMaskElem ||
10573 isa<UndefValue>(VL[Data.index()]) ||
10574 Data.value() == VL[Data.index()]);
10579 for (
unsigned Part : seq<unsigned>(NumParts)) {
10581 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10585 if (isa<UndefValue>(V) ||
10594 auto *EE = cast<ExtractElementInst>(V);
10595 VecBase = EE->getVectorOperand();
10596 UniqueBases.
insert(VecBase);
10597 const TreeEntry *VE = R.getTreeEntry(V);
10598 if (!CheckedExtracts.
insert(V).second ||
10599 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10602 return isa<GetElementPtrInst>(U) &&
10603 !R.areAllUsersVectorized(cast<Instruction>(U),
10611 unsigned Idx = *EEIdx;
10613 if (EE->hasOneUse() || !PrevNodeFound) {
10615 if (isa<SExtInst, ZExtInst>(Ext) &&
10616 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10621 EE->getVectorOperandType(),
Idx);
10624 Ext->getOpcode(), Ext->getType(), EE->getType(),
10639 if (!PrevNodeFound)
10640 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
10643 transformMaskAfterShuffle(CommonMask, CommonMask);
10644 SameNodesEstimated =
false;
10645 if (NumParts != 1 && UniqueBases.
size() != 1) {
10646 UseVecBaseAsInput =
true;
10654 std::optional<InstructionCost>
10658 return std::nullopt;
10664 return Idx < static_cast<int>(E1.getVectorFactor());
10666 "Expected single vector shuffle mask.");
10670 if (InVectors.
empty()) {
10671 CommonMask.
assign(Mask.begin(), Mask.end());
10672 InVectors.
assign({&E1, &E2});
10675 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
10678 if (NumParts == 0 || NumParts >= Mask.size() ||
10679 MaskVecTy->getNumElements() % NumParts != 0 ||
10681 MaskVecTy->getNumElements() / NumParts))
10686 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10687 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
10690 if (InVectors.
empty()) {
10691 CommonMask.
assign(Mask.begin(), Mask.end());
10692 InVectors.
assign(1, &E1);
10695 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
10698 if (NumParts == 0 || NumParts >= Mask.size() ||
10699 MaskVecTy->getNumElements() % NumParts != 0 ||
10701 MaskVecTy->getNumElements() / NumParts))
10706 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10707 estimateNodesPermuteCost(E1,
nullptr, Mask, Part, SliceSize);
10708 if (!SameNodesEstimated && InVectors.
size() == 1)
10720 auto *EI = cast<ExtractElementInst>(
10721 cast<const TreeEntry *>(InVectors.
front())
10722 ->getOrdered(
P.index()));
10723 return EI->getVectorOperand() == V1 ||
10724 EI->getVectorOperand() == V2;
10726 "Expected extractelement vectors.");
10730 if (InVectors.
empty()) {
10732 "Expected empty input mask/vectors.");
10733 CommonMask.
assign(Mask.begin(), Mask.end());
10734 InVectors.
assign(1, V1);
10740 InVectors.
size() == 1 && isa<const TreeEntry *>(InVectors.
front()) &&
10741 !CommonMask.
empty() &&
10745 InVectors.
front().get<const TreeEntry *>()->getOrdered(
10748 return P.value() == Mask[
P.index()] ||
10749 isa<UndefValue>(Scalar);
10750 if (isa<Constant>(V1))
10752 auto *EI = cast<ExtractElementInst>(Scalar);
10753 return EI->getVectorOperand() == V1;
10755 "Expected only tree entry for extractelement vectors.");
10759 "Expected only tree entries from extracts/reused buildvectors.");
10760 unsigned VF = getVF(V1);
10761 if (InVectors.
size() == 2) {
10762 Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
10763 transformMaskAfterShuffle(CommonMask, CommonMask);
10764 VF = std::max<unsigned>(VF, CommonMask.
size());
10765 }
else if (
const auto *InTE =
10766 InVectors.
front().dyn_cast<
const TreeEntry *>()) {
10767 VF = std::max(VF, InTE->getVectorFactor());
10770 VF, cast<FixedVectorType>(cast<Value *>(InVectors.
front())->getType())
10771 ->getNumElements());
10774 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10776 CommonMask[
Idx] = Mask[
Idx] + VF;
10779 Value *Root =
nullptr) {
10780 Cost += getBuildVectorCost(VL, Root);
10784 unsigned VF = VL.
size();
10786 VF = std::min(VF, MaskVF);
10788 if (isa<UndefValue>(V)) {
10794 if (
auto *VecTy = dyn_cast<FixedVectorType>(Vals.
front()->getType())) {
10801 Type *ScalarTy = V->getType()->getScalarType();
10803 if (isa<PoisonValue>(V))
10805 else if (isa<UndefValue>(V))
10809 std::fill_n(NewVals.
begin() +
I * VecTyNumElements, VecTyNumElements,
10812 Vals.
swap(NewVals);
10818 cast<FixedVectorType>(Root->
getType())->getNumElements()),
10825 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10828 IsFinalized =
true;
10831 if (InVectors.
size() == 2)
10832 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
10834 Cost += createShuffle(Vec,
nullptr, CommonMask);
10835 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10839 "Expected vector length for the final value before action.");
10840 Value *V = cast<Value *>(Vec);
10841 Action(V, CommonMask);
10842 InVectors.
front() = V;
10844 if (!SubVectors.empty()) {
10846 if (InVectors.
size() == 2)
10847 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
10849 Cost += createShuffle(Vec,
nullptr, CommonMask);
10850 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10854 if (!SubVectorsMask.
empty()) {
10856 "Expected same size of masks for subvectors and common mask.");
10858 copy(SubVectorsMask, SVMask.begin());
10859 for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
10862 I1 = I2 + CommonMask.
size();
10869 for (
auto [E,
Idx] : SubVectors) {
10870 Type *EScalarTy = E->Scalars.front()->getType();
10871 bool IsSigned =
true;
10872 if (
auto It =
R.MinBWs.find(E); It !=
R.MinBWs.end()) {
10875 IsSigned = It->second.second;
10877 if (ScalarTy != EScalarTy) {
10878 unsigned CastOpcode = Instruction::Trunc;
10879 unsigned DstSz =
R.DL->getTypeSizeInBits(ScalarTy);
10880 unsigned SrcSz =
R.DL->getTypeSizeInBits(EScalarTy);
10882 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10892 if (!CommonMask.
empty()) {
10893 std::iota(std::next(CommonMask.
begin(),
Idx),
10894 std::next(CommonMask.
begin(),
Idx + E->getVectorFactor()),
10901 if (CommonMask.
empty()) {
10902 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
10906 createShuffle(InVectors.
front(),
10907 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
10913 "Shuffle construction must be finalized.");
10917const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(
const TreeEntry *E,
10918 unsigned Idx)
const {
10919 if (
const TreeEntry *VE = getMatchedVectorizedOperand(E,
Idx))
10922 find_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
10923 return TE->isGather() &&
10924 find_if(TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
10925 return EI.EdgeIdx == Idx && EI.UserTE == E;
10926 }) != TE->UserTreeIndices.end();
10928 assert(It != VectorizableTree.end() &&
"Expected vectorizable entry.");
10933 if (TE.State == TreeEntry::ScatterVectorize ||
10934 TE.State == TreeEntry::StridedVectorize)
10936 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
10937 !TE.isAltShuffle()) {
10938 if (TE.ReorderIndices.empty())
10952 const unsigned VF,
unsigned MinBW,
10984 auto It = MinBWs.
find(E);
10985 Type *OrigScalarTy = ScalarTy;
10986 if (It != MinBWs.
end()) {
10987 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
10993 unsigned EntryVF = E->getVectorFactor();
10996 if (E->isGather()) {
10999 if (isa<InsertElementInst>(VL[0]))
11001 if (isa<CmpInst>(VL.
front()))
11002 ScalarTy = VL.
front()->getType();
11003 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11004 E, ScalarTy, *
TTI, VectorizedVals, *
this, CheckedExtracts);
11008 if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
11011 if (E->getOpcode() == Instruction::Store) {
11013 NewMask.
resize(E->ReorderIndices.size());
11014 copy(E->ReorderIndices, NewMask.
begin());
11020 if (!E->ReuseShuffleIndices.empty())
11021 ::addMask(Mask, E->ReuseShuffleIndices);
11025 assert((E->State == TreeEntry::Vectorize ||
11026 E->State == TreeEntry::ScatterVectorize ||
11027 E->State == TreeEntry::StridedVectorize) &&
11028 "Unhandled state");
11029 assert(E->getOpcode() &&
11031 (E->getOpcode() == Instruction::GetElementPtr &&
11032 E->getMainOp()->getType()->isPointerTy())) &&
11035 unsigned ShuffleOrOp =
11036 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
11037 if (E->CombinedOp != TreeEntry::NotCombinedOp)
11038 ShuffleOrOp = E->CombinedOp;
11040 const unsigned Sz = UniqueValues.
size();
11042 for (
unsigned I = 0;
I < Sz; ++
I) {
11043 if (isa<Instruction>(UniqueValues[
I]) && getTreeEntry(UniqueValues[
I]) == E)
11045 UsedScalars.set(
I);
11047 auto GetCastContextHint = [&](
Value *
V) {
11048 if (
const TreeEntry *OpTE = getTreeEntry(V))
11049 return getCastContextHint(*OpTE);
11050 InstructionsState SrcState =
getSameOpcode(E->getOperand(0), *TLI);
11051 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
11060 if (isa<CastInst, CallInst>(VL0)) {
11064 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
11066 for (
unsigned I = 0;
I < Sz; ++
I) {
11067 if (UsedScalars.test(
I))
11069 ScalarCost += ScalarEltCost(
I);
11078 (E->getOpcode() != Instruction::Load ||
11079 !E->UserTreeIndices.empty())) {
11080 const EdgeInfo &EI =
11081 *
find_if(E->UserTreeIndices, [](
const EdgeInfo &EI) {
11082 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
11084 if (EI.UserTE->getOpcode() != Instruction::Select ||
11086 auto UserBWIt = MinBWs.
find(EI.UserTE);
11087 Type *UserScalarTy =
11088 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
11089 if (UserBWIt != MinBWs.
end())
11091 UserBWIt->second.first);
11092 if (ScalarTy != UserScalarTy) {
11093 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
11094 unsigned SrcBWSz =
DL->getTypeSizeInBits(UserScalarTy);
11095 unsigned VecOpcode;
11096 auto *UserVecTy =
getWidenedType(UserScalarTy, E->Scalars.size());
11097 if (BWSz > SrcBWSz)
11098 VecOpcode = Instruction::Trunc;
11101 It->second.second ? Instruction::SExt : Instruction::ZExt;
11108 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
11109 ScalarCost,
"Calculated costs for Tree"));
11110 return VecCost - ScalarCost;
11115 assert((E->State == TreeEntry::Vectorize ||
11116 E->State == TreeEntry::StridedVectorize) &&
11117 "Entry state expected to be Vectorize or StridedVectorize here.");
11121 *
TTI, Ptrs, BasePtr, E->getOpcode(),
CostKind, OrigScalarTy, VecTy);
11122 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11123 "Calculated GEPs cost for Tree"));
11125 return VecCost - ScalarCost;
11132 Type *CanonicalType = Ty;
11139 {CanonicalType, CanonicalType});
11144 if (VI && SelectOnly) {
11146 "Expected only for scalar type.");
11147 auto *CI = cast<CmpInst>(
VI->getOperand(0));
11149 CI->
getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
11150 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
11151 {TTI::OK_AnyValue, TTI::OP_None}, CI);
11153 return IntrinsicCost;
11155 switch (ShuffleOrOp) {
11156 case Instruction::PHI: {
11160 for (
Value *V : UniqueValues) {
11161 auto *
PHI = dyn_cast<PHINode>(V);
11166 for (
unsigned I = 0,
N =
PHI->getNumIncomingValues();
I <
N; ++
I) {
11170 if (
const TreeEntry *OpTE = getTreeEntry(
Operands.front()))
11172 if (!OpTE->ReuseShuffleIndices.empty())
11173 ScalarCost +=
TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
11174 OpTE->Scalars.size());
11177 return CommonCost - ScalarCost;
11179 case Instruction::ExtractValue:
11180 case Instruction::ExtractElement: {
11181 auto GetScalarCost = [&](
unsigned Idx) {
11182 if (isa<PoisonValue>(UniqueValues[
Idx]))
11185 auto *
I = cast<Instruction>(UniqueValues[
Idx]);
11187 if (ShuffleOrOp == Instruction::ExtractElement) {
11188 auto *EE = cast<ExtractElementInst>(
I);
11189 SrcVecTy = EE->getVectorOperandType();
11191 auto *EV = cast<ExtractValueInst>(
I);
11192 Type *AggregateTy = EV->getAggregateOperand()->getType();
11194 if (
auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11195 NumElts = ATy->getNumElements();
11200 if (
I->hasOneUse()) {
11202 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
11203 all_of(
Ext->users(), IsaPred<GetElementPtrInst>)) {
11210 Ext->getOpcode(),
Ext->getType(),
I->getType(),
11218 auto GetVectorCost = [](
InstructionCost CommonCost) {
return CommonCost; };
11219 return GetCostDiff(GetScalarCost, GetVectorCost);
11221 case Instruction::InsertElement: {
11222 assert(E->ReuseShuffleIndices.empty() &&
11223 "Unique insertelements only are expected.");
11224 auto *SrcVecTy = cast<FixedVectorType>(VL0->
getType());
11225 unsigned const NumElts = SrcVecTy->getNumElements();
11226 unsigned const NumScalars = VL.
size();
11232 unsigned OffsetEnd = OffsetBeg;
11233 InsertMask[OffsetBeg] = 0;
11236 if (OffsetBeg >
Idx)
11238 else if (OffsetEnd <
Idx)
11240 InsertMask[
Idx] =
I + 1;
11243 if (NumOfParts > 0 && NumOfParts < NumElts)
11244 VecScalarsSz =
PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
11245 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
11247 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
11248 unsigned InsertVecSz = std::min<unsigned>(
11250 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
11251 bool IsWholeSubvector =
11252 OffsetBeg ==
Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
11256 if (OffsetBeg + InsertVecSz > VecSz) {
11259 InsertVecSz = VecSz;
11265 if (!E->ReorderIndices.empty()) {
11270 std::iota(
Mask.begin(), std::next(
Mask.begin(), InsertVecSz), 0);
11272 bool IsIdentity =
true;
11274 Mask.swap(PrevMask);
11275 for (
unsigned I = 0;
I < NumScalars; ++
I) {
11277 DemandedElts.
setBit(InsertIdx);
11278 IsIdentity &= InsertIdx - OffsetBeg ==
I;
11279 Mask[InsertIdx - OffsetBeg] =
I;
11281 assert(
Offset < NumElts &&
"Failed to find vector index offset");
11295 InsertVecTy, Mask);
11296 auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
11297 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11305 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
11306 if (!InMask.
all() && NumScalars != NumElts && !IsWholeSubvector) {
11307 if (InsertVecSz != VecSz) {
11318 for (
unsigned I = OffsetEnd + 1 -
Offset;
I < VecSz; ++
I)
11327 case Instruction::ZExt:
11328 case Instruction::SExt:
11329 case Instruction::FPToUI:
11330 case Instruction::FPToSI:
11331 case Instruction::FPExt:
11332 case Instruction::PtrToInt:
11333 case Instruction::IntToPtr:
11334 case Instruction::SIToFP:
11335 case Instruction::UIToFP:
11336 case Instruction::Trunc:
11337 case Instruction::FPTrunc:
11338 case Instruction::BitCast: {
11339 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
11342 unsigned Opcode = ShuffleOrOp;
11343 unsigned VecOpcode = Opcode;
11345 (SrcIt != MinBWs.
end() || It != MinBWs.
end())) {
11347 unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy->
getScalarType());
11348 if (SrcIt != MinBWs.
end()) {
11349 SrcBWSz = SrcIt->second.first;
11356 if (BWSz == SrcBWSz) {
11357 VecOpcode = Instruction::BitCast;
11358 }
else if (BWSz < SrcBWSz) {
11359 VecOpcode = Instruction::Trunc;
11360 }
else if (It != MinBWs.
end()) {
11361 assert(BWSz > SrcBWSz &&
"Invalid cast!");
11362 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11363 }
else if (SrcIt != MinBWs.
end()) {
11364 assert(BWSz > SrcBWSz &&
"Invalid cast!");
11366 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
11368 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.
end() &&
11369 !SrcIt->second.second) {
11370 VecOpcode = Instruction::UIToFP;
11373 assert(
Idx == 0 &&
"Expected 0 index only");
11381 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
11383 auto *
VI = VL0->
getOpcode() == Opcode ? VL0 :
nullptr;
11386 bool IsArithmeticExtendedReduction =
11387 E->Idx == 0 && UserIgnoreList &&
11389 auto *
I = cast<Instruction>(V);
11390 return is_contained({Instruction::Add, Instruction::FAdd,
11391 Instruction::Mul, Instruction::FMul,
11392 Instruction::And, Instruction::Or,
11396 if (IsArithmeticExtendedReduction &&
11397 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
11399 return CommonCost +
11401 VecOpcode == Opcode ? VI :
nullptr);
11403 return GetCostDiff(GetScalarCost, GetVectorCost);
11405 case Instruction::FCmp:
11406 case Instruction::ICmp:
11407 case Instruction::Select: {
11411 match(VL0, MatchCmp))
11417 auto GetScalarCost = [&](
unsigned Idx) {
11418 if (isa<PoisonValue>(UniqueValues[
Idx]))
11421 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
11428 !
match(VI, MatchCmp)) ||
11436 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
11437 CostKind, getOperandInfo(
VI->getOperand(0)),
11438 getOperandInfo(
VI->getOperand(1)), VI);
11441 ScalarCost = IntrinsicCost;
11450 CostKind, getOperandInfo(E->getOperand(0)),
11451 getOperandInfo(E->getOperand(1)), VL0);
11452 if (
auto *SI = dyn_cast<SelectInst>(VL0)) {
11455 unsigned CondNumElements = CondType->getNumElements();
11457 assert(VecTyNumElements >= CondNumElements &&
11458 VecTyNumElements % CondNumElements == 0 &&
11459 "Cannot vectorize Instruction::Select");
11460 if (CondNumElements != VecTyNumElements) {
11469 return VecCost + CommonCost;
11471 return GetCostDiff(GetScalarCost, GetVectorCost);
11473 case TreeEntry::MinMax: {
11474 auto GetScalarCost = [&](
unsigned Idx) {
11475 return GetMinMaxCost(OrigScalarTy);
11479 return VecCost + CommonCost;
11481 return GetCostDiff(GetScalarCost, GetVectorCost);
11483 case Instruction::FNeg:
11484 case Instruction::Add:
11485 case Instruction::FAdd:
11486 case Instruction::Sub:
11487 case Instruction::FSub:
11488 case Instruction::Mul:
11489 case Instruction::FMul:
11490 case Instruction::UDiv:
11491 case Instruction::SDiv:
11492 case Instruction::FDiv:
11493 case Instruction::URem:
11494 case Instruction::SRem:
11495 case Instruction::FRem:
11496 case Instruction::Shl:
11497 case Instruction::LShr:
11498 case Instruction::AShr:
11499 case Instruction::And:
11500 case Instruction::Or:
11501 case Instruction::Xor: {
11502 auto GetScalarCost = [&](
unsigned Idx) {
11503 if (isa<PoisonValue>(UniqueValues[
Idx]))
11506 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
11507 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
11516 if (ShuffleOrOp == Instruction::And && It != MinBWs.
end()) {
11517 for (
unsigned I : seq<unsigned>(0, E->getNumOperands())) {
11520 auto *CI = dyn_cast<ConstantInt>(
Op);
11521 return CI && CI->getValue().countr_one() >= It->second.first;
11526 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
11530 Op2Info, {},
nullptr, TLI) +
11533 return GetCostDiff(GetScalarCost, GetVectorCost);
11535 case Instruction::GetElementPtr: {
11536 return CommonCost + GetGEPCostDiff(VL, VL0);
11538 case Instruction::Load: {
11539 auto GetScalarCost = [&](
unsigned Idx) {
11540 auto *
VI = cast<LoadInst>(UniqueValues[
Idx]);
11542 VI->getAlign(),
VI->getPointerAddressSpace(),
11545 auto *LI0 = cast<LoadInst>(VL0);
11548 switch (E->State) {
11549 case TreeEntry::Vectorize:
11550 if (
unsigned Factor = E->getInterleaveFactor()) {
11552 Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
11553 LI0->getPointerAddressSpace(),
CostKind);
11557 Instruction::Load, VecTy, LI0->getAlign(),
11561 case TreeEntry::StridedVectorize: {
11562 Align CommonAlignment =
11563 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11565 Instruction::Load, VecTy, LI0->getPointerOperand(),
11566 false, CommonAlignment,
CostKind);
11569 case TreeEntry::ScatterVectorize: {
11570 Align CommonAlignment =
11571 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11573 Instruction::Load, VecTy, LI0->getPointerOperand(),
11574 false, CommonAlignment,
CostKind);
11577 case TreeEntry::CombinedVectorize:
11578 case TreeEntry::NeedToGather:
11581 return VecLdCost + CommonCost;
11587 if (E->State == TreeEntry::ScatterVectorize)
11593 PointerOps[
I] = cast<LoadInst>(V)->getPointerOperand();
11594 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
11596 case Instruction::Store: {
11597 bool IsReorder = !E->ReorderIndices.empty();
11598 auto GetScalarCost = [=](
unsigned Idx) {
11599 auto *
VI = cast<StoreInst>(VL[
Idx]);
11602 VI->getAlign(),
VI->getPointerAddressSpace(),
11606 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
11610 if (E->State == TreeEntry::StridedVectorize) {
11611 Align CommonAlignment =
11612 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
11614 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
11615 false, CommonAlignment,
CostKind);
11617 assert(E->State == TreeEntry::Vectorize &&
11618 "Expected either strided or consecutive stores.");
11619 if (
unsigned Factor = E->getInterleaveFactor()) {
11620 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
11621 "No reused shuffles expected");
11624 Instruction::Store, VecTy, Factor, std::nullopt,
11625 BaseSI->getAlign(), BaseSI->getPointerAddressSpace(),
CostKind);
11629 Instruction::Store, VecTy, BaseSI->getAlign(),
11630 BaseSI->getPointerAddressSpace(),
CostKind, OpInfo);
11633 return VecStCost + CommonCost;
11637 unsigned Idx = IsReorder ? E->ReorderIndices[
I] :
I;
11638 PointerOps[
Idx] = cast<StoreInst>(V)->getPointerOperand();
11641 return GetCostDiff(GetScalarCost, GetVectorCost) +
11642 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
11644 case Instruction::Call: {
11645 auto GetScalarCost = [&](
unsigned Idx) {
11646 auto *CI = cast<CallInst>(UniqueValues[
Idx]);
11657 auto *CI = cast<CallInst>(VL0);
11661 It != MinBWs.
end() ? It->second.first : 0,
TTI);
11663 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
11665 return GetCostDiff(GetScalarCost, GetVectorCost);
11667 case Instruction::ShuffleVector: {
11668 if (!
SLPReVec || E->isAltShuffle())
11669 assert(E->isAltShuffle() &&
11674 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
11675 "Invalid Shuffle Vector Operand");
11678 auto TryFindNodeWithEqualOperands = [=]() {
11679 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
11682 if (
TE->isAltShuffle() &&
11683 ((
TE->getOpcode() == E->getOpcode() &&
11684 TE->getAltOpcode() == E->getAltOpcode()) ||
11685 (
TE->getOpcode() == E->getAltOpcode() &&
11686 TE->getAltOpcode() == E->getOpcode())) &&
11687 TE->hasEqualOperands(*E))
11692 auto GetScalarCost = [&](
unsigned Idx) {
11693 if (isa<PoisonValue>(UniqueValues[
Idx]))
11696 auto *
VI = cast<Instruction>(UniqueValues[
Idx]);
11697 assert(E->isOpcodeOrAlt(VI) &&
"Unexpected main/alternate opcode");
11707 if (TryFindNodeWithEqualOperands()) {
11709 dbgs() <<
"SLP: diamond match for alternate node found.\n";
11716 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy,
CostKind);
11718 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
11719 }
else if (
auto *CI0 = dyn_cast<CmpInst>(VL0)) {
11721 VecCost = TTIRef.getCmpSelInstrCost(
11722 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(),
CostKind,
11723 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11725 VecCost += TTIRef.getCmpSelInstrCost(
11726 E->getOpcode(), VecTy, MaskTy,
11727 cast<CmpInst>(E->getAltOp())->getPredicate(),
CostKind,
11728 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11731 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
11734 auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
11735 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
11737 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
11738 if (SrcIt != MinBWs.
end()) {
11739 SrcBWSz = SrcIt->second.first;
11743 if (BWSz <= SrcBWSz) {
11744 if (BWSz < SrcBWSz)
11746 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
11750 <<
"SLP: alternate extension, which should be truncated.\n";
11756 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
11759 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
11763 E->buildAltOpShuffleMask(
11765 assert(E->isOpcodeOrAlt(
I) &&
"Unexpected main/alternate opcode");
11776 unsigned Opcode0 = E->getOpcode();
11777 unsigned Opcode1 = E->getAltOpcode();
11781 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
11783 VecTy, Opcode0, Opcode1, OpcodeMask,
CostKind);
11784 return AltVecCost < VecCost ? AltVecCost : VecCost;
11789 if (
SLPReVec && !E->isAltShuffle())
11790 return GetCostDiff(
11795 "Not supported shufflevector usage.");
11796 auto *SV = cast<ShuffleVectorInst>(VL.
front());
11797 unsigned SVNumElements =
11798 cast<FixedVectorType>(SV->getOperand(0)->getType())
11799 ->getNumElements();
11800 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
11801 for (
size_t I = 0,
End = VL.
size();
I !=
End;
I += GroupSize) {
11805 assert(isa<ShuffleVectorInst>(V) &&
11806 "Not supported shufflevector usage.");
11807 auto *SV = cast<ShuffleVectorInst>(V);
11809 [[maybe_unused]]
bool IsExtractSubvectorMask =
11810 SV->isExtractSubvectorMask(Index);
11811 assert(IsExtractSubvectorMask &&
11812 "Not supported shufflevector usage.");
11813 if (NextIndex != Index)
11815 NextIndex += SV->getShuffleMask().size();
11818 return ::getShuffleCost(
11824 return GetCostDiff(GetScalarCost, GetVectorCost);
11826 case Instruction::Freeze:
11833bool BoUpSLP::isFullyVectorizableTinyTree(
bool ForReduction)
const {
11835 << VectorizableTree.size() <<
" is fully vectorizable .\n");
11837 auto &&AreVectorizableGathers = [
this](
const TreeEntry *
TE,
unsigned Limit) {
11839 return TE->isGather() &&
11841 [
this](
Value *V) { return EphValues.contains(V); }) &&
11843 TE->Scalars.size() < Limit ||
11844 ((
TE->getOpcode() == Instruction::ExtractElement ||
11845 all_of(
TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
11847 (
TE->getOpcode() == Instruction::Load && !
TE->isAltShuffle()) ||
11848 any_of(
TE->Scalars, IsaPred<LoadInst>));
11852 if (VectorizableTree.size() == 1 &&
11853 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
11854 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
11856 AreVectorizableGathers(VectorizableTree[0].
get(),
11857 VectorizableTree[0]->Scalars.size()) &&
11858 VectorizableTree[0]->getVectorFactor() > 2)))
11861 if (VectorizableTree.size() != 2)
11869 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
11870 AreVectorizableGathers(VectorizableTree[1].
get(),
11871 VectorizableTree[0]->Scalars.size()))
11875 if (VectorizableTree[0]->
isGather() ||
11876 (VectorizableTree[1]->isGather() &&
11877 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
11878 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
11886 bool MustMatchOrInst) {
11890 Value *ZextLoad = Root;
11891 const APInt *ShAmtC;
11892 bool FoundOr =
false;
11893 while (!isa<ConstantExpr>(ZextLoad) &&
11896 ShAmtC->
urem(8) == 0))) {
11897 auto *BinOp = cast<BinaryOperator>(ZextLoad);
11898 ZextLoad = BinOp->getOperand(0);
11899 if (BinOp->getOpcode() == Instruction::Or)
11904 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
11911 Type *SrcTy = Load->getType();
11918 LLVM_DEBUG(
dbgs() <<
"SLP: Assume load combining for tree starting at "
11919 << *(cast<Instruction>(Root)) <<
"\n");
11928 unsigned NumElts = VectorizableTree[0]->Scalars.size();
11929 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
11937 unsigned NumElts = Stores.
size();
11938 for (
Value *Scalar : Stores) {
11952 if (VectorizableTree.empty()) {
11953 assert(ExternalUses.empty() &&
"We shouldn't have any external users");
11959 if (VectorizableTree.size() == 2 &&
11960 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
11961 VectorizableTree[1]->isGather() &&
11962 (VectorizableTree[1]->getVectorFactor() <= 2 ||
11963 !(
isSplat(VectorizableTree[1]->Scalars) ||
11971 constexpr int Limit = 4;
11973 !VectorizableTree.empty() &&
11974 all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
11975 return (TE->isGather() &&
11976 TE->getOpcode() != Instruction::ExtractElement &&
11977 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
11978 TE->getOpcode() == Instruction::PHI;
11989 if (isFullyVectorizableTinyTree(ForReduction))
11994 bool IsAllowedSingleBVNode =
11995 VectorizableTree.size() > 1 ||
11996 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
11997 !VectorizableTree.front()->isAltShuffle() &&
11998 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
11999 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
12001 if (
any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
12002 return TE->isGather() &&
all_of(TE->Scalars, [&](
Value *V) {
12003 return isa<ExtractElementInst, UndefValue>(V) ||
12004 (IsAllowedSingleBVNode &&
12005 !V->hasNUsesOrMore(UsesLimit) &&
12006 any_of(V->users(), IsaPred<InsertElementInst>));
12011 if (VectorizableTree.back()->isGather() &&
12012 VectorizableTree.back()->isAltShuffle() &&
12013 VectorizableTree.back()->getVectorFactor() > 2 &&
12015 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
12017 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
12018 VectorizableTree.back()->getVectorFactor()),
12031 constexpr unsigned SmallTree = 3;
12032 if (VectorizableTree.front()->isNonPowOf2Vec() &&
12035 [](
const std::unique_ptr<TreeEntry> &TE) {
12036 return TE->isGather() &&
12037 TE->getOpcode() == Instruction::Load &&
12045 TreeEntry &E = *VectorizableTree[
Idx];
12048 if (E.getOpcode() && E.getOpcode() != Instruction::Load)
12062 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
12075 for (
const auto &TEPtr : VectorizableTree) {
12076 if (TEPtr->State != TreeEntry::Vectorize)
12078 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
12084 auto *NodeA = DT->
getNode(
A->getParent());
12085 auto *NodeB = DT->
getNode(
B->getParent());
12086 assert(NodeA &&
"Should only process reachable instructions");
12087 assert(NodeB &&
"Should only process reachable instructions");
12088 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12089 "Different nodes should have different DFS numbers");
12090 if (NodeA != NodeB)
12091 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12092 return B->comesBefore(
A);
12102 LiveValues.
erase(PrevInst);
12103 for (
auto &J : PrevInst->
operands()) {
12104 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
12105 LiveValues.
insert(cast<Instruction>(&*J));
12109 dbgs() <<
"SLP: #LV: " << LiveValues.
size();
12110 for (
auto *
X : LiveValues)
12111 dbgs() <<
" " <<
X->getName();
12112 dbgs() <<
", Looking at ";
12117 unsigned NumCalls = 0;
12121 while (InstIt != PrevInstIt) {
12122 if (PrevInstIt == PrevInst->
getParent()->rend()) {
12123 PrevInstIt = Inst->getParent()->rbegin();
12128 if (
auto *
II = dyn_cast<IntrinsicInst>(
I)) {
12129 if (
II->isAssumeLikeIntrinsic())
12133 for (
auto &ArgOp :
II->args())
12134 Tys.push_back(ArgOp->getType());
12135 if (
auto *FPMO = dyn_cast<FPMathOperator>(
II))
12136 FMF = FPMO->getFastMathFlags();
12143 if (IntrCost < CallCost)
12150 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
12151 &*PrevInstIt != PrevInst)
12159 for (
auto *
II : LiveValues) {
12160 auto *ScalarTy =
II->getType();
12161 if (
auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
12162 ScalarTy = VectorTy->getElementType();
12180 const auto *I1 = IE1;
12181 const auto *I2 = IE2;
12193 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
12195 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
12196 if (I2 && ((I2 == IE2 || I2->
hasOneUse())) &&
12198 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
12199 }
while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
12206struct ValueSelect {
12207 template <
typename U>
12208 static std::enable_if_t<std::is_same_v<Value *, U>,
Value *>
get(
Value *V) {
12211 template <
typename U>
12212 static std::enable_if_t<!std::is_same_v<Value *, U>,
U>
get(
Value *) {
12230template <
typename T>
12236 assert(!ShuffleMask.empty() &&
"Empty list of shuffles for inserts.");
12238 auto VMIt = std::next(ShuffleMask.begin());
12241 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
12243 if (!IsBaseUndef.
all()) {
12245 std::pair<T *, bool> Res =
12246 ResizeAction(ShuffleMask.begin()->first, Mask,
false);
12248 for (
unsigned Idx = 0, VF = Mask.size();
Idx < VF; ++
Idx) {
12252 Mask[
Idx] = (Res.second ?
Idx : Mask[
Idx]) + VF;
12254 auto *V = ValueSelect::get<T *>(
Base);
12256 assert((!V || GetVF(V) == Mask.size()) &&
12257 "Expected base vector of VF number of elements.");
12258 Prev = Action(Mask, {
nullptr, Res.first});
12259 }
else if (ShuffleMask.size() == 1) {
12262 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
12268 Prev = Action(Mask, {ShuffleMask.begin()->first});
12272 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
12273 unsigned Vec2VF = GetVF(VMIt->first);
12274 if (Vec1VF == Vec2VF) {
12278 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
12281 Mask[
I] = SecMask[
I] + Vec1VF;
12284 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
12287 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
12289 std::pair<T *, bool> Res2 =
12290 ResizeAction(VMIt->first, VMIt->second,
false);
12292 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
12299 Mask[
I] = (Res2.second ?
I : SecMask[
I]) + VF;
12302 Prev = Action(Mask, {Res1.first, Res2.first});
12304 VMIt = std::next(VMIt);
12306 bool IsBaseNotUndef = !IsBaseUndef.
all();
12307 (void)IsBaseNotUndef;
12309 for (
auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
12311 std::pair<T *, bool> Res =
12312 ResizeAction(VMIt->first, VMIt->second,
false);
12314 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
12317 "Multiple uses of scalars.");
12318 Mask[
I] = (Res.second ?
I : SecMask[
I]) + VF;
12323 Prev = Action(Mask, {Prev, Res.first});
12331template <
typename T>
struct ShuffledInsertData {
12342 << VectorizableTree.size() <<
".\n");
12344 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
12347 for (
unsigned I = 0, E = VectorizableTree.size();
I < E; ++
I) {
12348 TreeEntry &TE = *VectorizableTree[
I];
12351 if (TE.State == TreeEntry::CombinedVectorize) {
12353 dbgs() <<
"SLP: Skipping cost for combined node that starts with "
12354 << *TE.Scalars[0] <<
".\n";
12355 TE.dump();
dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12358 if (TE.isGather()) {
12359 if (
const TreeEntry *E = getTreeEntry(TE.getMainOp());
12360 E && E->getVectorFactor() == TE.getVectorFactor() &&
12361 E->isSame(TE.Scalars)) {
12366 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12373 assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
12374 "Expected gather nodes with users only.");
12380 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12389 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
12396 for (ExternalUser &EU : ExternalUses) {
12397 ScalarUserAndIdx.
emplace_back(EU.Scalar, EU.User, EU.Lane);
12399 for (ExternalUser &EU : ExternalUses) {
12403 if (EphValues.
count(EU.User))
12409 EU.User ? cast<Instruction>(EU.User)->
getParent() :
nullptr;
12412 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
12416 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
12417 !ExtractCostCalculated.
insert(EU.Scalar).second)
12421 if (isa<FixedVectorType>(EU.Scalar->getType()))
12426 if (
auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
12428 if (
auto *FTy = dyn_cast<FixedVectorType>(VU->
getType())) {
12429 if (!UsedInserts.
insert(VU).second)
12433 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
12436 [
this, VU](
const ShuffledInsertData<const TreeEntry *> &
Data) {
12441 Value *Op0 =
II->getOperand(0);
12442 if (getTreeEntry(
II) && !getTreeEntry(Op0))
12448 if (It == ShuffledInserts.
end()) {
12450 Data.InsertElements.emplace_back(VU);
12452 VecId = ShuffledInserts.
size() - 1;
12453 auto It = MinBWs.
find(ScalarTE);
12454 if (It != MinBWs.
end() &&
12456 .
insert(std::make_pair(ScalarTE, FTy->getElementType()))
12458 unsigned BWSz = It->second.first;
12459 unsigned DstBWSz =
DL->getTypeSizeInBits(FTy->getElementType());
12460 unsigned VecOpcode;
12461 if (DstBWSz < BWSz)
12462 VecOpcode = Instruction::Trunc;
12465 It->second.second ? Instruction::SExt : Instruction::ZExt;
12470 FTy->getNumElements()),
12473 <<
" for extending externally used vector with "
12474 "non-equal minimum bitwidth.\n");
12479 It->InsertElements.front() = VU;
12480 VecId = std::distance(ShuffledInserts.
begin(), It);
12482 int InIdx = *InsertIdx;
12484 ShuffledInserts[VecId].ValueMasks[ScalarTE];
12487 Mask[InIdx] = EU.Lane;
12488 DemandedElts[VecId].setBit(InIdx);
12499 auto *VecTy =
getWidenedType(EU.Scalar->getType(), BundleWidth);
12500 const TreeEntry *Entry = getTreeEntry(EU.Scalar);
12501 auto It = MinBWs.
find(Entry);
12502 if (It != MinBWs.
end()) {
12505 ? Instruction::ZExt
12506 : Instruction::SExt;
12513 EU.Lane, EU.Scalar, ScalarUserAndIdx);
12516 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
12517 Entry->getOpcode() == Instruction::Load) {
12519 auto IsPhiInLoop = [&](
const ExternalUser &U) {
12520 if (
auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
12521 auto *
I = cast<Instruction>(U.Scalar);
12522 const Loop *L = LI->getLoopFor(Phi->getParent());
12523 return L && (Phi->getParent() ==
I->getParent() ||
12524 L == LI->getLoopFor(
I->getParent()));
12528 if (!ValueToExtUses) {
12529 ValueToExtUses.emplace();
12532 if (IsPhiInLoop(
P.value()))
12535 ValueToExtUses->try_emplace(
P.value().Scalar,
P.index());
12540 auto *Inst = cast<Instruction>(EU.Scalar);
12542 auto OperandIsScalar = [&](
Value *V) {
12543 if (!getTreeEntry(V)) {
12547 if (
auto *EE = dyn_cast<ExtractElementInst>(V))
12548 return !EE->hasOneUse() || !MustGather.contains(EE);
12551 return ValueToExtUses->contains(V);
12553 bool CanBeUsedAsScalar =
all_of(Inst->operands(), OperandIsScalar);
12554 bool CanBeUsedAsScalarCast =
false;
12555 if (
auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
12556 if (
auto *
Op = dyn_cast<Instruction>(CI->
getOperand(0));
12557 Op &&
all_of(
Op->operands(), OperandIsScalar)) {
12559 (getTreeEntry(
Op) && !ValueToExtUses->contains(
Op))
12562 if (ScalarCost + OpCost <= ExtraCost) {
12563 CanBeUsedAsScalar = CanBeUsedAsScalarCast =
true;
12564 ScalarCost += OpCost;
12568 if (CanBeUsedAsScalar) {
12569 bool KeepScalar = ScalarCost <= ExtraCost;
12573 bool IsProfitablePHIUser =
12575 VectorizableTree.front()->Scalars.size() > 2)) &&
12576 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
12580 auto *PHIUser = dyn_cast<PHINode>(U);
12581 return (!PHIUser ||
12582 PHIUser->getParent() !=
12584 VectorizableTree.front()->getMainOp())
12589 return ValueToExtUses->contains(V);
12591 if (IsProfitablePHIUser) {
12595 (!GatheredLoadsEntriesFirst.has_value() ||
12596 Entry->Idx < *GatheredLoadsEntriesFirst)) {
12597 unsigned ScalarUsesCount =
count_if(Entry->Scalars, [&](
Value *V) {
12598 return ValueToExtUses->contains(V);
12600 auto It = ExtractsCount.
find(Entry);
12601 if (It != ExtractsCount.
end()) {
12602 assert(ScalarUsesCount >= It->getSecond().size() &&
12603 "Expected total number of external uses not less than "
12604 "number of scalar uses.");
12605 ScalarUsesCount -= It->getSecond().size();
12610 KeepScalar = ScalarUsesCount <= 1 || !
has_single_bit(ScalarUsesCount);
12613 ExternalUsesAsOriginalScalar.
insert(EU.Scalar);
12615 auto It = ValueToExtUses->find(V);
12616 if (It != ValueToExtUses->end()) {
12618 ExternalUses[It->second].User = nullptr;
12621 ExtraCost = ScalarCost;
12622 if (!IsPhiInLoop(EU))
12623 ExtractsCount[Entry].
insert(Inst);
12624 if (CanBeUsedAsScalarCast) {
12625 ScalarOpsFromCasts.
insert(Inst->getOperand(0));
12628 if (
auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
12630 auto It = ValueToExtUses->find(V);
12631 if (It != ValueToExtUses->end()) {
12633 ExternalUses[It->second].User = nullptr;
12642 ExtractCost += ExtraCost;
12646 for (
Value *V : ScalarOpsFromCasts) {
12647 ExternalUsesAsOriginalScalar.
insert(V);
12648 if (
const TreeEntry *E = getTreeEntry(V)) {
12649 ExternalUses.emplace_back(V,
nullptr, E->findLaneForValue(V));
12653 if (!VectorizedVals.
empty()) {
12654 const TreeEntry &Root = *VectorizableTree.front();
12655 auto BWIt = MinBWs.find(&Root);
12656 if (BWIt != MinBWs.end()) {
12657 Type *DstTy = Root.Scalars.front()->getType();
12660 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
12661 if (OriginalSz != SrcSz) {
12662 unsigned Opcode = Instruction::Trunc;
12663 if (OriginalSz > SrcSz)
12664 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
12666 if (
auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
12678 Cost += SpillCost + ExtractCost;
12682 unsigned VF =
Mask.size();
12683 unsigned VecVF =
TE->getVectorFactor();
12685 (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); }) ||
12688 std::copy(
Mask.begin(), std::next(
Mask.begin(), std::min(VF, VecVF)),
12694 dbgs() <<
"SLP: Adding cost " <<
C
12695 <<
" for final shuffle of insertelement external users.\n";
12696 TE->dump();
dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12698 return std::make_pair(TE,
true);
12700 return std::make_pair(TE,
false);
12703 for (
int I = 0, E = ShuffledInserts.size();
I < E; ++
I) {
12704 Value *
Base = ShuffledInserts[
I].InsertElements.front()->getOperand(0);
12705 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
12709 assert((TEs.size() == 1 || TEs.size() == 2) &&
12710 "Expected exactly 1 or 2 tree entries.");
12711 if (TEs.size() == 1) {
12713 VF = TEs.front()->getVectorFactor();
12714 auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12718 (
Data.index() < VF &&
12719 static_cast<int>(
Data.index()) ==
Data.value());
12724 <<
" for final shuffle of insertelement "
12725 "external users.\n";
12726 TEs.front()->
dump();
12727 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12733 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
12734 VF = TEs.front()->getVectorFactor();
12738 auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12742 <<
" for final shuffle of vector node and external "
12743 "insertelement users.\n";
12744 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
12745 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12751 (void)performExtractsShuffleAction<const TreeEntry>(
12753 [](
const TreeEntry *E) {
return E->getVectorFactor(); }, ResizeToVF,
12754 EstimateShufflesCost);
12756 cast<FixedVectorType>(
12757 ShuffledInserts[
I].InsertElements.front()->getType()),
12760 Cost -= InsertCost;
12764 if (ReductionBitWidth != 0) {
12765 assert(UserIgnoreList &&
"Expected reduction tree.");
12766 const TreeEntry &E = *VectorizableTree.front();
12767 auto It = MinBWs.find(&E);
12768 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
12769 unsigned SrcSize = It->second.first;
12770 unsigned DstSize = ReductionBitWidth;
12771 unsigned Opcode = Instruction::Trunc;
12772 if (SrcSize < DstSize) {
12773 bool IsArithmeticExtendedReduction =
12775 auto *
I = cast<Instruction>(V);
12776 return is_contained({Instruction::Add, Instruction::FAdd,
12777 Instruction::Mul, Instruction::FMul,
12778 Instruction::And, Instruction::Or,
12782 if (IsArithmeticExtendedReduction)
12784 Instruction::BitCast;
12786 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12788 if (Opcode != Instruction::BitCast) {
12790 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
12792 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
12795 switch (E.getOpcode()) {
12796 case Instruction::SExt:
12797 case Instruction::ZExt:
12798 case Instruction::Trunc: {
12799 const TreeEntry *OpTE = getOperandEntry(&E, 0);
12800 CCH = getCastContextHint(*OpTE);
12810 <<
" for final resize for reduction from " << SrcVecTy
12811 <<
" to " << DstVecTy <<
"\n";
12812 dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
12821 OS <<
"SLP: Spill Cost = " << SpillCost <<
".\n"
12822 <<
"SLP: Extract Cost = " << ExtractCost <<
".\n"
12823 <<
"SLP: Total Cost = " <<
Cost <<
".\n";
12827 ViewGraph(
this,
"SLP" +
F->getName(),
false, Str);
12838std::optional<TTI::ShuffleKind>
12839BoUpSLP::tryToGatherSingleRegisterExtractElements(
12845 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
12846 auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
12848 if (isa<UndefValue>(VL[
I]))
12852 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
12853 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
12866 ExtractMask.reset(*
Idx);
12871 VectorOpToIdx[EI->getVectorOperand()].push_back(
I);
12876 stable_sort(Vectors, [](
const auto &P1,
const auto &P2) {
12877 return P1.second.size() > P2.second.size();
12880 const int UndefSz = UndefVectorExtracts.
size();
12881 unsigned SingleMax = 0;
12882 unsigned PairMax = 0;
12883 if (!Vectors.
empty()) {
12884 SingleMax = Vectors.
front().second.size() + UndefSz;
12885 if (Vectors.
size() > 1) {
12886 auto *ItNext = std::next(Vectors.
begin());
12887 PairMax = SingleMax + ItNext->second.size();
12890 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
12891 return std::nullopt;
12897 if (SingleMax >= PairMax && SingleMax) {
12898 for (
int Idx : Vectors.
front().second)
12900 }
else if (!Vectors.
empty()) {
12901 for (
unsigned Idx : {0, 1})
12902 for (
int Idx : Vectors[
Idx].second)
12906 for (
int Idx : UndefVectorExtracts)
12910 std::optional<TTI::ShuffleKind> Res =
12916 return std::nullopt;
12920 for (
int I = 0, E = GatheredExtracts.size();
I < E; ++
I) {
12921 if (Mask[
I] ==
PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[
I]) &&
12922 isa<UndefValue>(GatheredExtracts[
I])) {
12926 auto *EI = dyn_cast<ExtractElementInst>(VL[
I]);
12927 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
12928 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
12943 unsigned NumParts)
const {
12944 assert(NumParts > 0 &&
"NumParts expected be greater than or equal to 1.");
12948 for (
unsigned Part : seq<unsigned>(NumParts)) {
12954 std::optional<TTI::ShuffleKind> Res =
12955 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
12956 ShufflesRes[Part] = Res;
12957 copy(SubMask, std::next(
Mask.begin(), Part * SliceSize));
12959 if (
none_of(ShufflesRes, [](
const std::optional<TTI::ShuffleKind> &Res) {
12960 return Res.has_value();
12962 ShufflesRes.clear();
12963 return ShufflesRes;
12966std::optional<TargetTransformInfo::ShuffleKind>
12967BoUpSLP::isGatherShuffledSingleRegisterEntry(
12973 const EdgeInfo &TEUseEI =
TE == VectorizableTree.front().get()
12974 ? EdgeInfo(
const_cast<TreeEntry *
>(TE), 0)
12975 :
TE->UserTreeIndices.front();
12976 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
12980 if (
auto *
PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
12981 TEInsertBlock =
PHI->getIncomingBlock(TEUseEI.EdgeIdx);
12984 TEInsertBlock = TEInsertPt->
getParent();
12987 return std::nullopt;
12988 auto *NodeUI = DT->
getNode(TEInsertBlock);
12989 assert(NodeUI &&
"Should only process reachable instructions");
12991 auto CheckOrdering = [&](
const Instruction *InsertPt) {
13005 auto *NodeEUI = DT->
getNode(InsertBlock);
13008 assert((NodeUI == NodeEUI) ==
13009 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
13010 "Different nodes should have different DFS numbers");
13012 if (TEInsertPt->
getParent() != InsertBlock &&
13015 if (TEInsertPt->
getParent() == InsertBlock &&
13029 for (
Value *V : VL) {
13034 for (
const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
13035 if (TEPtr == TE || TEPtr->Idx == 0)
13038 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
13039 "Must contain at least single gathered value.");
13040 assert(TEPtr->UserTreeIndices.size() == 1 &&
13041 "Expected only single user of a gather node.");
13042 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
13044 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
13047 : &getLastInstructionInBundle(UseEI.UserTE);
13048 if (TEInsertPt == InsertPt) {
13052 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
13056 if (TEUseEI.UserTE != UseEI.UserTE &&
13057 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
13063 if ((TEInsertBlock != InsertPt->
getParent() ||
13064 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
13065 !CheckOrdering(InsertPt))
13069 if (
const TreeEntry *VTE = getTreeEntry(V)) {
13070 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
13071 if (VTE->State != TreeEntry::Vectorize) {
13072 auto It = MultiNodeScalars.
find(V);
13073 if (It == MultiNodeScalars.
end())
13075 VTE = *It->getSecond().begin();
13077 auto *MIt =
find_if(It->getSecond(), [](
const TreeEntry *MTE) {
13078 return MTE->State == TreeEntry::Vectorize;
13080 if (MIt == It->getSecond().end())
13085 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
13086 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
13090 if (VToTEs.
empty())
13092 if (UsedTEs.
empty()) {
13106 if (!VToTEs.
empty()) {
13112 VToTEs = SavedVToTEs;
13121 if (UsedTEs.
size() == 2)
13123 UsedTEs.push_back(SavedVToTEs);
13130 if (UsedTEs.
empty()) {
13132 return std::nullopt;
13136 if (UsedTEs.
size() == 1) {
13139 UsedTEs.front().
end());
13140 sort(FirstEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
13141 return TE1->Idx < TE2->Idx;
13144 auto *It =
find_if(FirstEntries, [=](
const TreeEntry *EntryPtr) {
13145 return EntryPtr->isSame(VL) || EntryPtr->isSame(
TE->Scalars);
13147 if (It != FirstEntries.end() &&
13148 ((*It)->getVectorFactor() == VL.size() ||
13149 ((*It)->getVectorFactor() ==
TE->Scalars.size() &&
13150 TE->ReuseShuffleIndices.size() == VL.size() &&
13151 (*It)->isSame(
TE->Scalars)))) {
13152 Entries.push_back(*It);
13153 if ((*It)->getVectorFactor() == VL.size()) {
13154 std::iota(std::next(
Mask.begin(), Part * VL.size()),
13155 std::next(
Mask.begin(), (Part + 1) * VL.size()), 0);
13161 for (
unsigned I : seq<unsigned>(VL.size()))
13162 if (isa<PoisonValue>(VL[
I]))
13168 Entries.push_back(FirstEntries.front());
13171 assert(UsedTEs.
size() == 2 &&
"Expected at max 2 permuted entries.");
13174 for (
const TreeEntry *TE : UsedTEs.front()) {
13175 unsigned VF =
TE->getVectorFactor();
13176 auto It = VFToTE.
find(VF);
13177 if (It != VFToTE.
end()) {
13178 if (It->second->Idx >
TE->Idx)
13179 It->getSecond() =
TE;
13186 UsedTEs.back().
end());
13187 sort(SecondEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
13188 return TE1->Idx < TE2->Idx;
13190 for (
const TreeEntry *TE : SecondEntries) {
13191 auto It = VFToTE.
find(
TE->getVectorFactor());
13192 if (It != VFToTE.
end()) {
13194 Entries.push_back(It->second);
13195 Entries.push_back(TE);
13201 if (Entries.empty()) {
13203 UsedTEs.front(), [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
13204 return TE1->Idx < TE2->Idx;
13206 Entries.push_back(SecondEntries.front());
13207 VF = std::max(Entries.front()->getVectorFactor(),
13208 Entries.back()->getVectorFactor());
13212 bool IsSplatOrUndefs =
isSplat(VL) ||
all_of(VL, IsaPred<UndefValue>);
13215 auto AreCompatiblePHIs = [&](
Value *
V,
Value *V1) {
13216 auto *
PHI = cast<PHINode>(V);
13217 auto *PHI1 = cast<PHINode>(V1);
13222 for (
int I = 0, E =
PHI->getNumIncomingValues();
I < E; ++
I) {
13224 Value *In1 = PHI1->getIncomingValue(
I);
13229 if (cast<Instruction>(In)->
getParent() !=
13239 auto MightBeIgnored = [=](
Value *
V) {
13240 auto *
I = dyn_cast<Instruction>(V);
13241 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.
count(
I) &&
13243 !areAllUsersVectorized(
I, UserIgnoreList) &&
isSimple(
I);
13248 auto NeighborMightBeIgnored = [&](
Value *
V,
int Idx) {
13250 bool UsedInSameVTE =
false;
13251 auto It = UsedValuesEntry.
find(V1);
13252 if (It != UsedValuesEntry.
end())
13253 UsedInSameVTE = It->second == UsedValuesEntry.
find(V)->second;
13254 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
13256 cast<Instruction>(V)->getParent() ==
13257 cast<Instruction>(V1)->getParent() &&
13258 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
13263 for (
int I = 0, E = VL.size();
I < E; ++
I) {
13265 auto It = UsedValuesEntry.
find(V);
13266 if (It == UsedValuesEntry.
end())
13272 ((
I > 0 && NeighborMightBeIgnored(V,
I - 1)) ||
13273 (
I != E - 1 && NeighborMightBeIgnored(V,
I + 1)))))
13275 unsigned Idx = It->second;
13282 for (
unsigned I = 0, Sz = Entries.size();
I < Sz; ++
I) {
13283 if (!UsedIdxs.test(
I))
13289 for (std::pair<unsigned, int> &Pair : EntryLanes)
13290 if (Pair.first ==
I)
13291 Pair.first = TempEntries.
size();
13294 Entries.swap(TempEntries);
13295 if (EntryLanes.size() == Entries.size() &&
13297 .
slice(Part * VL.size(),
13298 std::min<int>(VL.size(),
TE->Scalars.size())))) {
13304 return std::nullopt;
13307 bool IsIdentity = Entries.size() == 1;
13310 for (
const std::pair<unsigned, int> &Pair : EntryLanes) {
13311 unsigned Idx = Part * VL.size() + Pair.second;
13314 (ForOrder ? std::distance(
13315 Entries[Pair.first]->Scalars.begin(),
13316 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
13317 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
13318 IsIdentity &=
Mask[
Idx] == Pair.second;
13320 switch (Entries.size()) {
13322 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13326 if (EntryLanes.size() > 2 || VL.size() <= 2)
13334 std::fill(std::next(
Mask.begin(), Part * VL.size()),
13336 return std::nullopt;
13340BoUpSLP::isGatherShuffledEntry(
13344 assert(NumParts > 0 && NumParts < VL.
size() &&
13345 "Expected positive number of registers.");
13348 if (TE == VectorizableTree.front().get() &&
13349 (!GatheredLoadsEntriesFirst.has_value() ||
13351 [](
const std::unique_ptr<TreeEntry> &TE) {
13352 return !
TE->isGather();
13356 if (
TE->isNonPowOf2Vec())
13359 assert((
TE->UserTreeIndices.size() == 1 ||
13360 TE == VectorizableTree.front().get()) &&
13361 "Expected only single user of the gather node.");
13363 "Number of scalars must be divisible by NumParts.");
13364 if (!
TE->UserTreeIndices.empty() &&
13365 TE->UserTreeIndices.front().UserTE->isGather() &&
13366 TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
13367 assert((
TE->Idx == 0 ||
TE->getOpcode() == Instruction::ExtractElement ||
13369 "Expected splat or extractelements only node.");
13374 for (
unsigned Part : seq<unsigned>(NumParts)) {
13378 std::optional<TTI::ShuffleKind> SubRes =
13379 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
13382 SubEntries.
clear();
13385 SubEntries.
front()->getVectorFactor() == VL.
size() &&
13386 (SubEntries.
front()->isSame(
TE->Scalars) ||
13387 SubEntries.
front()->isSame(VL))) {
13389 LocalSubEntries.
swap(SubEntries);
13392 std::iota(
Mask.begin(),
Mask.end(), 0);
13394 for (
int I = 0, Sz = VL.
size();
I < Sz; ++
I)
13395 if (isa<PoisonValue>(VL[
I]))
13397 Entries.emplace_back(1, LocalSubEntries.
front());
13403 [](
const std::optional<TTI::ShuffleKind> &SK) {
return !SK; })) {
13411 Type *ScalarTy)
const {
13413 bool DuplicateNonConst =
false;
13421 auto EstimateInsertCost = [&](
unsigned I,
Value *
V) {
13422 if (
V->getType() != ScalarTy) {
13433 for (
unsigned I = 0, E = VL.
size();
I < E; ++
I) {
13436 if ((ForPoisonSrc &&
isConstant(V)) || isa<UndefValue>(V)) {
13444 EstimateInsertCost(
I, V);
13445 ShuffleMask[
I] =
I;
13449 DuplicateNonConst =
true;
13451 ShuffleMask[
I] = Res.first->second;
13453 if (ForPoisonSrc) {
13454 if (isa<FixedVectorType>(ScalarTy)) {
13460 for (
unsigned I : seq<unsigned>(VL.
size()))
13461 if (!ShuffledElements[
I])
13464 I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
13472 if (DuplicateNonConst)
13474 VecTy, ShuffleMask);
13478Instruction &BoUpSLP::getLastInstructionInBundle(
const TreeEntry *E) {
13479 auto &Res = EntryToLastInstruction.
try_emplace(E).first->second;
13485 auto *Front = E->getMainOp();
13487 assert(((GatheredLoadsEntriesFirst.has_value() &&
13488 E->getOpcode() == Instruction::Load && E->isGather() &&
13489 E->Idx < *GatheredLoadsEntriesFirst) ||
13491 [=](
Value *V) ->
bool {
13492 if (E->getOpcode() == Instruction::GetElementPtr &&
13493 !isa<GetElementPtrInst>(V))
13495 auto *I = dyn_cast<Instruction>(V);
13496 return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
13497 isVectorLikeInstWithConstOps(I);
13499 "Expected gathered loads or GEPs or instructions from same basic "
13502 auto FindLastInst = [&]() {
13504 for (
Value *V : E->Scalars) {
13505 auto *
I = dyn_cast<Instruction>(V);
13508 if (LastInst->
getParent() ==
I->getParent()) {
13513 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13514 !isa<GetElementPtrInst>(
I)) ||
13517 (GatheredLoadsEntriesFirst.has_value() &&
13518 E->getOpcode() == Instruction::Load && E->isGather() &&
13519 E->Idx < *GatheredLoadsEntriesFirst)) &&
13520 "Expected vector-like or non-GEP in GEP node insts only.");
13528 auto *NodeB = DT->
getNode(
I->getParent());
13529 assert(NodeA &&
"Should only process reachable instructions");
13530 assert(NodeB &&
"Should only process reachable instructions");
13531 assert((NodeA == NodeB) ==
13532 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13533 "Different nodes should have different DFS numbers");
13534 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
13541 auto FindFirstInst = [&]() {
13543 for (
Value *V : E->Scalars) {
13544 auto *
I = dyn_cast<Instruction>(V);
13547 if (FirstInst->
getParent() ==
I->getParent()) {
13548 if (
I->comesBefore(FirstInst))
13552 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13553 !isa<GetElementPtrInst>(
I)) ||
13556 "Expected vector-like or non-GEP in GEP node insts only.");
13564 auto *NodeB = DT->
getNode(
I->getParent());
13565 assert(NodeA &&
"Should only process reachable instructions");
13566 assert(NodeB &&
"Should only process reachable instructions");
13567 assert((NodeA == NodeB) ==
13568 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13569 "Different nodes should have different DFS numbers");
13570 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
13577 if (GatheredLoadsEntriesFirst.has_value() &&
13578 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
13579 E->getOpcode() == Instruction::Load) {
13580 Res = FindFirstInst();
13588 if ((E->getOpcode() == Instruction::GetElementPtr &&
13591 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
13595 return isa<PoisonValue>(V) ||
13596 (!isVectorLikeInstWithConstOps(V) &&
13597 isUsedOutsideBlock(V));
13599 (E->isGather() && E->Idx == 0 &&
all_of(E->Scalars, [](
Value *V) {
13600 return isa<ExtractElementInst, UndefValue>(V) ||
13601 areAllOperandsNonInsts(V);
13603 Res = FindLastInst();
13605 Res = FindFirstInst();
13613 if (BlocksSchedules.count(BB) && !E->isGather()) {
13614 Value *
V = E->isOneOf(E->Scalars.back());
13617 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
13618 if (Bundle && Bundle->isPartOfBundle())
13619 for (; Bundle; Bundle = Bundle->NextInBundle)
13620 Res = Bundle->Inst;
13642 Res = FindLastInst();
13643 assert(Res &&
"Failed to find last instruction in bundle");
13647void BoUpSLP::setInsertPointAfterBundle(
const TreeEntry *E) {
13648 auto *Front = E->getMainOp();
13649 Instruction *LastInst = &getLastInstructionInBundle(E);
13650 assert(LastInst &&
"Failed to find last instruction in bundle");
13653 bool IsPHI = isa<PHINode>(LastInst);
13655 LastInstIt = LastInst->
getParent()->getFirstNonPHIIt();
13657 Builder.SetInsertPoint(LastInst->
getParent(), LastInstIt);
13661 Builder.SetInsertPoint(
13665 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
13668Value *BoUpSLP::gather(
13677 Loop *
L = LI->getLoopFor(Builder.GetInsertBlock());
13680 while (InsertBB && InsertBB != InstBB && Visited.
insert(InsertBB).second)
13681 InsertBB = InsertBB->getSinglePredecessor();
13682 return InsertBB && InsertBB == InstBB;
13684 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
13685 if (
auto *Inst = dyn_cast<Instruction>(VL[
I]))
13686 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
13687 getTreeEntry(Inst) ||
13688 (L && (!Root ||
L->isLoopInvariant(Root)) &&
L->contains(Inst))) &&
13689 PostponedIndices.
insert(
I).second)
13693 auto &&CreateInsertElement = [
this](
Value *Vec,
Value *
V,
unsigned Pos,
13696 if (
Scalar->getType() != Ty) {
13700 if (
auto *CI = dyn_cast<CastInst>(Scalar);
13701 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
13703 if (
auto *IOp = dyn_cast<Instruction>(
Op);
13704 !IOp || !(
isDeleted(IOp) || getTreeEntry(IOp)))
13707 Scalar = Builder.CreateIntCast(
13712 if (
auto *VecTy = dyn_cast<FixedVectorType>(
Scalar->getType())) {
13714 Vec = InsElt = Builder.CreateInsertVector(
13717 auto *
II = dyn_cast<IntrinsicInst>(InsElt);
13718 if (!
II ||
II->getIntrinsicID() != Intrinsic::vector_insert)
13721 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
13722 InsElt = dyn_cast<InsertElementInst>(Vec);
13726 GatherShuffleExtractSeq.
insert(InsElt);
13729 if (isa<Instruction>(V)) {
13730 if (TreeEntry *Entry = getTreeEntry(V)) {
13732 User *UserOp =
nullptr;
13734 if (
auto *SI = dyn_cast<Instruction>(Scalar))
13740 unsigned FoundLane =
Entry->findLaneForValue(V);
13741 ExternalUses.emplace_back(V, UserOp, FoundLane);
13751 std::iota(
Mask.begin(),
Mask.end(), 0);
13752 Value *OriginalRoot = Root;
13753 if (
auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
13754 SV && isa<PoisonValue>(SV->getOperand(1)) &&
13755 SV->getOperand(0)->getType() == VecTy) {
13756 Root = SV->getOperand(0);
13757 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
13760 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
13767 if (isa<PoisonValue>(VL[
I]))
13769 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
13773 if (isa<PoisonValue>(Vec)) {
13774 Vec = OriginalRoot;
13776 Vec = CreateShuffle(Root, Vec, Mask);
13777 if (
auto *OI = dyn_cast<Instruction>(OriginalRoot);
13778 OI && OI->hasNUses(0) &&
13779 none_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
13780 return TE->VectorizedValue == OI;
13786 for (
int I : NonConsts)
13787 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
13790 for (
const std::pair<Value *, unsigned> &Pair : PostponedInsts)
13791 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
13829 bool IsFinalized =
false;
13842 class ShuffleIRBuilder {
13855 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
13856 CSEBlocks(CSEBlocks),
DL(
DL) {}
13857 ~ShuffleIRBuilder() =
default;
13860 if (V1->
getType() != V2->getType()) {
13863 "Expected integer vector types only.");
13864 if (V1->
getType() != V2->getType()) {
13865 if (cast<VectorType>(V2->getType())
13867 ->getIntegerBitWidth() < cast<VectorType>(V1->
getType())
13869 ->getIntegerBitWidth())
13878 if (
auto *
I = dyn_cast<Instruction>(Vec)) {
13879 GatherShuffleExtractSeq.
insert(
I);
13880 CSEBlocks.
insert(
I->getParent());
13889 unsigned VF = Mask.size();
13890 unsigned LocalVF = cast<FixedVectorType>(V1->
getType())->getNumElements();
13894 if (
auto *
I = dyn_cast<Instruction>(Vec)) {
13895 GatherShuffleExtractSeq.
insert(
I);
13896 CSEBlocks.
insert(
I->getParent());
13900 Value *createIdentity(
Value *V) {
return V; }
13901 Value *createPoison(
Type *Ty,
unsigned VF) {
13906 void resizeToMatch(
Value *&V1,
Value *&V2) {
13907 if (V1->
getType() == V2->getType())
13909 int V1VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
13910 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
13911 int VF = std::max(V1VF, V2VF);
13912 int MinVF = std::min(V1VF, V2VF);
13914 std::iota(IdentityMask.
begin(), std::next(IdentityMask.
begin(), MinVF),
13916 Value *&
Op = MinVF == V1VF ? V1 : V2;
13918 if (
auto *
I = dyn_cast<Instruction>(
Op)) {
13919 GatherShuffleExtractSeq.
insert(
I);
13920 CSEBlocks.
insert(
I->getParent());
13933 assert(V1 &&
"Expected at least one vector value.");
13934 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
13935 R.CSEBlocks, *R.DL);
13936 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
13944 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
13952 std::optional<bool> IsSigned = std::nullopt) {
13953 auto *VecTy = cast<VectorType>(V->getType());
13964 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      // ...
      auto *EI = cast<ExtractElementInst>(VL[I]);
      VecBase = EI->getVectorOperand();
      if (const TreeEntry *TE = R.getTreeEntry(VecBase))
        VecBase = TE->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If the only one use is vectorized - can delete the extractelement
      // itself.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            const TreeEntry *UTE = R.getTreeEntry(U);
            return !UTE || R.MultiNodeScalars.contains(U) ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return any_of(TE->UserTreeIndices,
                                            [&](const EdgeInfo &Edge) {
                                              return Edge.UserTE == UTE;
                                            });
                            }) != 1 ||
                   is_contained(VL, EI);
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    // ...
    Value *Vec = nullptr;
    for (unsigned Part : seq<unsigned>(NumParts)) {
      // ...
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (const TreeEntry *TE = R.getTreeEntry(VecOp))
              VecOp = TE->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        // ...
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (const TreeEntry *TE = R.getTreeEntry(VecOp))
          VecOp = TE->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      // ... (asserts over Mask.slice(P * SliceSize, ...):
      //      "Expected first part or all previous parts masked.")
      copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      // ...
      unsigned NewVF =
          cast<FixedVectorType>(Vec->getType())->getNumElements();
      unsigned SubVecVF =
          cast<FixedVectorType>(SubVec->getType())->getNumElements();
      NewVF = std::max(NewVF, SubVecVF);
      // ...
      for (int &Idx : SubMask) {
        // ...
      }
      copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      Vec = createShuffle(Vec, SubVec, VecMask);
      TransformToIdentity(VecMask);
    }
    return Vec;
  }
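  // needToDelay() postpones emission of a gather node until all tree entries
  // it depends on have been vectorized; when a delay is needed it returns a
  // placeholder value of the widened type that is patched up later.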
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all deps are ready.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the
    // process to keep correct order.
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    // ...
  }
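  // The add() overloads below accumulate shuffle operands: the TreeEntry
  // forms first sign/zero-extend the vectorized value to the common scalar
  // type (a lane is treated as signed if any of its scalars may be negative),
  // while the raw Value forms merge the new operand into
  // InVectors/CommonMask, folding the two previously accumulated vectors
  // into one shuffle whenever a third input would otherwise be required.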
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
             if (isa<PoisonValue>(V))
               return false;
             return !isKnownNonNegative(
                 V, SimplifyQuery(*R.DL));
           }));
    Value *V2 = E2.VectorizedValue;
    if (V2->getType()->isIntOrIntVectorTy())
      V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
             if (isa<PoisonValue>(V))
               return false;
             return !isKnownNonNegative(
                 V, SimplifyQuery(*R.DL));
           }));
    add(V1, V2, Mask);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
             if (isa<PoisonValue>(V))
               return false;
             return !isKnownNonNegative(
                 V, SimplifyQuery(*R.DL));
           }));
    add(V1, Mask);
  }
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               CommonMask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
14190 "castToScalarTyElem expects V1 to be FixedVectorType");
14191 V1 = castToScalarTyElem(V1);
14192 if (InVectors.
empty()) {
14194 CommonMask.
assign(Mask.begin(), Mask.end());
14197 const auto *It =
find(InVectors, V1);
14198 if (It == InVectors.
end()) {
14199 if (InVectors.
size() == 2 ||
14202 if (InVectors.
size() == 2) {
14203 V = createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
14204 transformMaskAfterShuffle(CommonMask, CommonMask);
14205 }
else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
14206 CommonMask.
size()) {
14207 V = createShuffle(InVectors.
front(),
nullptr, CommonMask);
14208 transformMaskAfterShuffle(CommonMask, CommonMask);
14210 unsigned VF = std::max(CommonMask.
size(), Mask.size());
14211 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14214 V->getType() != V1->
getType()
14216 : Mask[
Idx] + cast<FixedVectorType>(V1->
getType())
14217 ->getNumElements();
14218 if (V->getType() != V1->
getType())
14219 V1 = createShuffle(V1,
nullptr, Mask);
14220 InVectors.
front() = V;
14221 if (InVectors.
size() == 2)
14222 InVectors.
back() = V1;
14229 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14235 int VF = getVF(V1);
14236 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14238 CommonMask[
Idx] = Mask[
Idx] + (It == InVectors.
begin() ? 0 : VF);
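  // gather() materializes the remaining scalars with insertelement chains via
  // R.gather(), handing createShuffle() back as the callback used to combine
  // partially built vectors.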
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
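  // finalize() flushes the accumulated state into IR: it applies the common
  // mask (running the optional Action callback on the pre-extension vector),
  // inserts the combined sub-vectors at their scalar offsets, remaps the
  // external extract mask, and returns the final vector. The destructor
  // asserts that this happens exactly once.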
  Value *
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    SmallVector<int> NewExtMask(ExtMask.begin(), ExtMask.end());
    if (ScalarTyNumElements != 1) {
      // ...
      ExtMask = NewExtMask;
    }
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask);
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = E->VectorizedValue;
          if (V->getType()->isIntOrIntVectorTy())
            V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
                  if (isa<PoisonValue>(V))
                    return false;
                  return !isKnownNonNegative(
                      V, SimplifyQuery(*R.DL));
                }));
          unsigned InsertionIndex = Idx * ScalarTyNumElements;
          const unsigned SubVecVF =
              cast<FixedVectorType>(V->getType())->getNumElements();
          if (InsertionIndex % SubVecVF == 0) {
            Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
                                             Builder.getInt64(InsertionIndex));
          } else {
            // Create shuffle, insertvector requires that index is multiple of
            // the subvector length.
            const unsigned VecVF =
                cast<FixedVectorType>(Vec->getType())->getNumElements();
            SmallVector<int> Mask(VecVF, PoisonMaskElem);
            std::iota(Mask.begin(), Mask.end(), 0);
            for (unsigned I : seq<unsigned>(
                     InsertionIndex, (Idx + SubVecVF) * ScalarTyNumElements)) {
              // ...
            }
            Vec = createShuffle(Vec, V, Mask);
          }
          if (!CommonMask.empty()) {
            std::iota(
                std::next(CommonMask.begin(), InsertionIndex),
                std::next(CommonMask.begin(),
                          (Idx + E->getVectorFactor()) * ScalarTyNumElements),
                InsertionIndex);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Value *InsertVec =
            CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
        Vec = createShuffle(InsertVec, Vec, SVMask);
        for (unsigned I : seq<unsigned>(CommonMask.size())) {
          if (SVMask[I] != PoisonMaskElem)
            CommonMask[I] = I;
        }
      }
      InVectors.front() = Vec;
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
                                                         unsigned NodeIdx) {
  ArrayRef<Value *> VL = E->getOperand(NodeIdx);
  InstructionsState S = getSameOpcode(VL, *TLI);
  // Special processing for GEPs bundle, which may include non-gep values.
  if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    if (It != VL.end())
      S = getSameOpcode(*It, *TLI);
  }
  if (!S.getOpcode())
    return nullptr;
  auto CheckSameVE = [&](const TreeEntry *VE) {
    return VE->isSame(VL) &&
           (any_of(VE->UserTreeIndices,
                   [E, NodeIdx](const EdgeInfo &EI) {
                     return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
                   }) ||
            any_of(VectorizableTree,
                   [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
                     return TE->isOperandGatherNode(
                                {const_cast<TreeEntry *>(E), NodeIdx}) &&
                            VE->isSame(TE->Scalars);
                   }));
  };
  TreeEntry *VE = getTreeEntry(S.getMainOp());
  if (VE && CheckSameVE(VE))
    return VE;
  auto It = MultiNodeScalars.find(S.getMainOp());
  if (It != MultiNodeScalars.end()) {
    auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
      return TE != VE && CheckSameVE(TE);
    });
    if (I != It->getSecond().end())
      return *I;
  }
  return nullptr;
}
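// vectorizeOperand() returns the vector for operand NodeIdx of E. When a
// matching vectorized entry exists, its value is reshuffled into the expected
// shape (reuse indices, broadcast of a smaller VF, reordering); otherwise the
// corresponding single-user gather node is vectorized in place.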
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                 bool PostponedPHIs) {
  ValueList &VL = E->getOperand(NodeIdx);
  const unsigned VF = VL.size();
  if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
    auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
      // V may be affected by MinBWs decision, need to cast it to the expected
      // type.
      Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
      ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
      ShuffleBuilder.add(V, Mask);
      SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
          E->CombinedEntriesWithIndices.size());
      transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
                [&](const auto &P) {
                  return std::make_pair(VectorizableTree[P.first].get(),
                                        P.second);
                });
      assert((E->CombinedEntriesWithIndices.empty() ||
              E->ReorderIndices.empty()) &&
             "Expected either combined subnodes or reordering");
      return ShuffleBuilder.finalize({}, SubVectors, {});
    };
    Value *V = vectorizeTree(VE, PostponedPHIs);
    if (VF * getNumElements(VL[0]->getType()) !=
        cast<FixedVectorType>(V->getType())->getNumElements()) {
      if (!VE->ReuseShuffleIndices.empty()) {
        // Reshuffle to get only unique values.
        // ...
        SmallVector<int> Mask(VF, PoisonMaskElem);
        for (auto [I, V] : enumerate(VL)) {
          if (isa<PoisonValue>(V))
            continue;
          Mask[I] = VE->findLaneForValue(V);
        }
        V = FinalShuffle(V, Mask);
      } else {
        assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
               "Expected vectorization factor less "
               "than original vector size.");
        SmallVector<int> UniformMask(VF, 0);
        std::iota(UniformMask.begin(), UniformMask.end(), 0);
        V = FinalShuffle(V, UniformMask);
      }
    }
    // Need to update the operand gather node, if actually the operand is not
    // a vectorized node, but the buildvector/gather node, which matches one
    // of the vectorized nodes.
    if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
          return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
        }) == VE->UserTreeIndices.end()) {
      auto *It =
          find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
                   TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
          });
      assert(It != VectorizableTree.end() && "Expected gather node operand.");
      (*It)->VectorizedValue = V;
    }
    return V;
  }

  // Find the corresponding gather entry and vectorize it.
  auto *I = find_if(VectorizableTree,
                    [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
                      return TE->isOperandGatherNode({E, NodeIdx});
                    });
  assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
  assert(I->get()->UserTreeIndices.size() == 1 &&
         "Expected only single user for the gather node.");
  assert(I->get()->isSame(VL) && "Expected same list of scalars.");
  return vectorizeTree(I->get(), PostponedPHIs);
}
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
                                       E->ReuseShuffleIndices.end());
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Clear values, to be replaced by insertvector instructions.
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    for_each(MutableArrayRef(GatheredScalars)
                 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
             [&](Value *&V) { V = PoisonValue::get(V->getType()); });
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  // Build a mask out of the reorder indices and reorder scalars per this
  // mask.
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  SmallVector<int> SubVectorsMask;
  inversePermutation(E->ReorderIndices, SubVectorsMask);
  // Transform non-clustered elements in the mask to poison (assigned by mask).
  // "Clustered" operations will be reordered using this mask later.
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (unsigned I : seq<unsigned>(GatheredScalars.size()))
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
  } else {
    SubVectorsMask.clear();
  }
  SmallVector<Value *> StoredGS(GatheredScalars);
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
    unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      auto *It =
          find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
            return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
                     return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
                   }) != TE->UserTreeIndices.end();
          });
      if (It == VectorizableTree.end())
        return false;
      SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
      if (!(*It)->ReorderIndices.empty()) {
        inversePermutation((*It)->ReorderIndices, ReorderMask);
        reorderScalars(GS, ReorderMask);
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
                   (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
                    is_contained(E->Scalars, V1));
          }))
        return false;
    }
    int Idx;
    if ((Mask.size() < InputVF &&
         ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
         Idx == 0) ||
        (Mask.size() == InputVF &&
         ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
      std::iota(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          0);
    } else {
      unsigned IVal =
          *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
      std::fill(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          IVal);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
  unsigned NumParts = TTI->getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
      !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
                                GatheredScalars.size()))
    NumParts = 1;
  if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
    // Check for gathered extracts.
    bool Resized = false;
    ExtractShuffles =
        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(ExtractMask)) {
        if (I == PoisonMaskElem)
          continue;
        if (const auto *TE = getTreeEntry(
                cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand()))
          ExtractEntries.push_back(TE);
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep correct order.
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            Resized = true;
            GatheredScalars.append(VF - GatheredScalars.size(),
                                   PoisonValue::get(OrigScalarTy));
          }
      }
    }
    // Gather extracts after we check for full matched gathers only.
    if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
        ((E->getOpcode() == Instruction::Load ||
          any_of(E->Scalars, IsaPred<LoadInst>)) &&
         any_of(E->Scalars,
                [this](Value *V) {
                  return isa<LoadInst>(V) && getTreeEntry(V);
                })) ||
        E->isAltShuffle() ||
        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
        isSplat(E->Scalars) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffles =
          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
    }
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
          Entries.front().front()->isSame(E->Scalars)) {
        // Perfectly matched allocation, do not create shuffle, just reuse
        // the vector with the scalars.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          << shortBundleName(E->Scalars, E->Idx) << ".\n");
        // Restore the mask for previous partially matched values.
        Mask.resize(E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
        } else {
          for (auto [I, V] : enumerate(E->Scalars)) {
            if (isa<PoisonValue>(V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        ShuffleBuilder.add(*FrontTE, Mask);
        Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors,
                                      SubVectorsMask);
        return Res;
      }
      if (!Resized) {
        if (GatheredScalars.size() != VF &&
            any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
              return any_of(TEs, [&](const TreeEntry *TE) {
                return TE->getVectorFactor() == VF;
              });
            }))
          GatheredScalars.append(VF - GatheredScalars.size(),
                                 PoisonValue::get(OrigScalarTy));
      }
      // Remove shuffled elements from list of gathers.
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        if (Mask[I] != PoisonMaskElem)
          GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
      }
    }
  }
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
    Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-const values and all constant values.
    // For repeated values, just shuffle them.
    int NumNonConsts = 0;
    int SinglePos = 0;
    for (auto [I, V] : enumerate(Scalars)) {
      if (isa<UndefValue>(V)) {
        if (!isa<PoisonValue>(V)) {
          ReuseMask[I] = I;
          UndefPos.push_back(I);
        }
        continue;
      }
      if (isConstant(V)) {
        ReuseMask[I] = I;
        continue;
      }
      ++NumNonConsts;
      SinglePos = I;
      Value *OrigV = V;
      Scalars[I] = PoisonValue::get(OrigScalarTy);
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore single insert element.
      if (IsSplat) {
        ReuseMask.assign(VF, PoisonMaskElem);
        std::swap(Scalars.front(), Scalars[SinglePos]);
        if (!UndefPos.empty() && UndefPos.front() == 0)
          Scalars.front() = UndefValue::get(OrigScalarTy);
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with the simple broadcast.
      // We can do it if the broadcasted value is guaranteed to be
      // non-poisonous, or by freezing the incoming scalar value first.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               (getTreeEntry(V) || isGuaranteedNotToBePoison(V, AC) ||
                (E->UserTreeIndices.size() == 1 &&
                 any_of(V->uses(), [E](const Use &U) {
                   // Check if the value already used in the same operation in
                   // one of the nodes already.
                   return E->UserTreeIndices.front().EdgeIdx !=
                              U.getOperandNo() &&
                          is_contained(
                              E->UserTreeIndices.front().UserTE->Scalars,
                              U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // Set the undef position to the non-poisoned scalar.
          ReuseMask[I] = Pos;
          // Replace the undef by the poison, in the mask it is replaced by
          // non-poisoned scalar already.
          if (I != Pos)
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
      } else {
        // Replace undefs by the poisons, emit broadcast and then emit
        // freeze.
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          if (isa<UndefValue>(Scalars[I]))
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // Gather of extractelements can be represented as just a shuffle of
      // a single/two vectors the scalars are extracted from.
      // Find input vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
          ExtractMask[I] = PoisonMaskElem;
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          if (isa<UndefValue>(E->Scalars[I]))
            continue;
          auto *EI = cast<ExtractElementInst>(StoredGS[I]);
          Value *VecOp = EI->getVectorOperand();
          if (const auto *TE = getTreeEntry(VecOp))
            if (TE->VectorizedValue)
              VecOp = TE->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
                         isGuaranteedNotToBePoison(Vec2, AC);
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
                           /*ForExtracts=*/true);
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
        auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
        VecMask.assign(VecMask.size(), PoisonMaskElem);
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
                isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
        }
      }
    }
    // Try to figure out best way to combine values: build a shuffle and
    // insert elements or just build several shuffles.
    // Insert non-constant scalars.
    SmallVector<Value *> NonConstants(GatheredScalars);
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build constant vector and shuffle with it only if currently we
    // have a single permutation and more than 1 scalar constants.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(V) && !isa<PoisonValue>(V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(V) && !isa<UndefValue>(V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(V) && !isa<PoisonValue>(V);
         }) > 1);
    // NonConstants array contains just non-constant values, GatheredScalars
    // contains only constant to build final vector and then shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(OrigScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
    // Generate constants for final shuffle and build a mask for them.
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
            TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
            Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
          });
  } else if (!allConstant(GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
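// createBuildVector() is the IR instantiation of processBuildVector: the
// combined sub-entries are vectorized first so their values are available
// when the shuffle builder stitches them back in.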
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  bool PostponedPHIs) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder, *this);
}
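// vectorizeTree(TreeEntry *E, bool PostponedPHIs) emits code for one tree
// entry. Gather nodes go through createBuildVector(); vectorized nodes
// dispatch on the (possibly alternate) opcode below, each case vectorizing
// its operands, honoring the MinBWs bitwidth demotions, and running
// FinalShuffle to apply reorder/reuse masks before caching
// E->VectorizedValue.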
  for (Value *V : VL)
    if (isa<Instruction>(V)) {
      // ...
    }
  // ...

Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
  // ...
  if (E->VectorizedValue &&
      (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
       E->isAltShuffle())) {
    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  if (!isa<CmpInst>(V))
    ScalarTy = getValueType(V);
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    ScalarTy = IntegerType::get(F->getContext(), It->second.first);
    if (VecTy)
      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
  }
  auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
  if (E->isGather()) {
    // Set insert point for non-reduction initial nodes.
    if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
    E->VectorizedValue = Vec;
    return Vec;
  }

  bool IsReverseOrder =
      !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask(
          reinterpret_cast<const int *>(E->ReorderIndices.begin()),
          E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() ||
            !E->UserTreeIndices.empty()) &&
           "PHI reordering is free.");
    if (PostponedPHIs && E->VectorizedValue)
      return E->VectorizedValue;
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    // ...
    if (PostponedPHIs || !E->VectorizedValue) {
      PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
      E->PHI = NewPhi;
      Value *V = NewPhi;
      // Adjust insertion point once all PHI's have been generated.
      Builder.SetInsertPoint(PH->getParent(),
                             PH->getParent()->getFirstInsertionPt());
      // ...
      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      if (PostponedPHIs)
        return V;
    }
    PHINode *NewPhi = cast<PHINode>(E->PHI);
    // PHINodes may have multiple entries from the same block. We want to
    // visit every block once.
    SmallPtrSet<BasicBlock *, 4> VisitedBBs;

    for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
      BasicBlock *IBB = PH->getIncomingBlock(I);

      // Stop emission if all incoming values are generated.
      if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return NewPhi;
      }

      if (!VisitedBBs.insert(IBB).second) {
        NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
        continue;
      }

      Builder.SetInsertPoint(IBB->getTerminator());
      Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      NewPhi->addIncoming(Vec, IBB);
    }

    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
           "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    if (const TreeEntry *TE = getTreeEntry(V))
      V = TE->VectorizedValue;
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    Builder.SetInsertPoint(LI);
    Value *Ptr = LI->getPointerOperand();
    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
    Value *NewV = ::propagateMetadata(V, E->Scalars);
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    return NewV;
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
    Value *V = vectorizeOperand(E, 1, PostponedPHIs);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
      // ...
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
          V,
          getWidenedType(
              ScalarTy,
              cast<FixedVectorType>(V->getType())->getNumElements()),
          Res.second);
    }

    // Create InsertVector shuffle if necessary.
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();

    unsigned Offset = *getElementIndex(VL0);
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create shuffle to resize vector.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(E->ReorderIndices, Mask);
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      Value *Scalar = E->Scalars[PrevMask[I]];
      unsigned InsertIdx = *getElementIndex(Scalar);
      IsIdentity &= InsertIdx - Offset == I;
      Mask[InsertIdx - Offset] = I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous =
          !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // Follow all insert element instructions from the current buildvector
        // sequence.
        InsertElementInst *Ins = cast<InsertElementInst>(VL0);
        do {
          std::optional<unsigned> InsertIdx = getElementIndex(Ins);
          if (!InsertIdx)
            break;
          if (InsertMask[*InsertIdx] == PoisonMaskElem)
            InsertMask[*InsertIdx] = *InsertIdx;
          if (!Ins->hasOneUse())
            break;
          Ins = dyn_cast_or_null<InsertElementInst>(
              Ins->getUniqueUndroppableUser());
        } while (Ins);
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        SmallBitVector IsFirstUndef =
            isUndefVector(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          unsigned Idx = 0;
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              if (!V2)
                V2 = UndefValue::get(V->getType());
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              ++Idx;
            }
            // ...
          }
        }
      }
      if (!V2)
        V2 = PoisonValue::get(V->getType());
      V = Builder.CreateShuffleVector(V, V2, InsertMask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    for (unsigned I = 0; I < NumElts; I++) {
      if (Mask[I] != PoisonMaskElem)
        InsertMask[Offset + I] = I;
    }
    SmallBitVector UseMask =
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
    SmallBitVector IsFirstUndef =
        isUndefVector(FirstInsert->getOperand(0), UseMask);
    if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
        NumElts != NumScalars) {
      if (IsFirstUndef.all()) {
        // ...
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
              InsertMask[I] = I + NumElts;
          }
        }
        V = Builder.CreateShuffleVector(
            V,
            IsFirstPoison.all() ? PoisonValue::get(V->getType())
                                : FirstInsert->getOperand(0),
            InsertMask, cast<Instruction>(E->Scalars.back())->getName());
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      } else {
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        for (unsigned I = 0; I < NumElts; I++) {
          if (InsertMask[I] == PoisonMaskElem)
            InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
          else
            InsertMask[I] += NumElts;
        }
        V = Builder.CreateShuffleVector(
            FirstInsert->getOperand(0), V, InsertMask,
            cast<Instruction>(E->Scalars.back())->getName());
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
    }

    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    auto *CI = cast<CastInst>(VL0);
    Instruction::CastOps VecOpcode = CI->getOpcode();
    Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *R = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (L->getType() != R->getType()) {
      assert((getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }

    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    Value *V = Builder.CreateCmp(P0, L, R);
    propagateIRFlags(V, E->Scalars, VL0);
    // Do not cast for cmps.
    VecTy = cast<FixedVectorType>(V->getType());
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *True = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *False = vectorizeOperand(E, 2, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
              getOperandEntry(E, 2)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }

    unsigned CondNumElements = getNumElements(Cond->getType());
    unsigned TrueNumElements = getNumElements(True->getType());
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
    assert(TrueNumElements == getNumElements(False->getType()) &&
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      // When the return type is i1 but the source is fixed vector type, we
      // need to duplicate the condition value.
      Cond = Builder.CreateShuffleVector(
          Cond, createReplicatedMask(TrueNumElements / CondNumElements,
                                     CondNumElements));
    }
    assert(getNumElements(Cond->getType()) == TrueNumElements &&
           "Cannot vectorize Instruction::Select");
    Value *V = Builder.CreateSelect(Cond, True, False);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);

    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    Value *V = Builder.CreateUnOp(
        static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
    propagateIRFlags(V, E->Scalars, VL0);
    if (auto *I = dyn_cast<Instruction>(V))
      V = ::propagateMetadata(I, E->Scalars);

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);

    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    if (Op->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
        ArrayRef<Value *> Ops = E->getOperand(I);
        if (all_of(Ops, [&](Value *Op) {
              auto *CI = dyn_cast<ConstantInt>(Op);
              return CI && CI->getValue().countr_one() >= It->second.first;
            })) {
          Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
          E->VectorizedValue = V;
          ++NumVectorInstructions;
          return V;
        }
      }
    }
    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (LHS->getType() != VecTy)
        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
      if (RHS->getType() != VecTy)
        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
    }

    Value *V = Builder.CreateBinOp(
        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
    propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = ::propagateMetadata(I, E->Scalars);
      // Drop nuw flags for abs(sub(commutative), true).
      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          any_of(E->Scalars, [](Value *V) {
            return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
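  // The Load case below picks one of three shapes: a plain wide load for
  // consecutive pointers, @llvm.experimental.vp.strided.load when the
  // pointers form a constant or runtime stride, and a masked gather
  // otherwise. As a rough sketch, four i32 loads spaced 8 bytes apart become
  // (operand order per the intrinsic definition, assuming a 4-lane VF):
  //   %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
  //            ptr %base, i64 8, <4 x i1> splat (i1 true), i32 4)
  // with a reverse shuffle added afterwards when the scalars were in
  // reverse order.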
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::StridedVectorize) {
      Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
      Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
      PO = IsReverseOrder ? PtrN : Ptr0;
      std::optional<int> Diff = getPointersDiff(
          E->Scalars.front()->getType(), Ptr0, E->Scalars.back()->getType(),
          PtrN, *DL, *SE);
      Type *StrideTy = DL->getIndexType(PO->getType());
      Value *StrideVal;
      if (Diff) {
        int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
        StrideVal =
            ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                           DL->getTypeAllocSize(ScalarTy));
      } else {
        SmallVector<Value *> PointerOps(E->Scalars.size());
        transform(E->Scalars, PointerOps.begin(), [](Value *V) {
          return cast<LoadInst>(V)->getPointerOperand();
        });
        std::optional<Value *> Stride =
            calculateRtStride(PointerOps, ScalarTy, *DL, *SE,
                              &*Builder.GetInsertPoint());
        // ...
        StrideVal = Builder.CreateMul(
            *Stride,
            ConstantInt::get(StrideTy,
                             (IsReverseOrder ? -1 : 1) *
                                 static_cast<int>(
                                     DL->getTypeAllocSize(ScalarTy))));
      }
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {VecTy, PO->getType(), StrideTy},
          {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /*ArgNo=*/0, Attribute::getWithAlignment(Inst->getContext(),
                                                   CommonAlignment));
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      if (isa<FixedVectorType>(ScalarTy)) {
        // CreateMaskedGather expects VecTy and VecPtr have same size. We need
        // to expand VecPtr if ScalarTy is a vector type.
        unsigned ScalarTyNumElements =
            cast<FixedVectorType>(ScalarTy)->getNumElements();
        unsigned VecTyNumElements =
            cast<FixedVectorType>(VecTy)->getNumElements();
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
        SmallVector<Constant *> Indices(VecTyNumElements);
        transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
          return Builder.getInt64(I % ScalarTyNumElements);
        });
        // ...
      }
      // Use the minimum alignment of the gathered loads.
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = ::propagateMetadata(NewLI, E->Scalars);

    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);

    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);

    Value *Ptr = SI->getPointerOperand();
    Instruction *ST;
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
        Ptr = SI->getPointerOperand();
      }
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          {VecValue, Ptr,
           ConstantInt::get(
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      Inst->addParamAttr(
          /*ArgNo=*/1, Attribute::getWithAlignment(Inst->getContext(),
                                                   CommonAlignment));
      ST = Inst;
    }

    Value *V = ::propagateMetadata(ST, E->Scalars);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = ::propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0, TTI);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    SmallVector<Type *, 2> TysForDecl;
    // ...
    auto *CEI = cast<CallInst>(VL0);
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      // Some intrinsics have scalar arguments. This argument should not be
      // vectorized.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
        ScalarArg = CEI->getArgOperand(I);
        // If we decided to reduce the bitwidth of an abs intrinsic, its
        // second argument must be set to false (do not return poison if the
        // value is the signed minimum).
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        OpVecs.push_back(ScalarArg);
        // ...
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      ScalarArg = CEI->getArgOperand(I);
      if (cast<VectorType>(OpVec->getType())->getElementType() !=
              ScalarArg->getType()->getScalarType() &&
          It == MinBWs.end()) {
        auto *CastTy =
            getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      // ...
      OpVecs.push_back(OpVec);
    }

    Function *CF;
    if (!UseIntrinsic) {
      // ...
    } else {
      // ...
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

    propagateIRFlags(V, E->Scalars, VL0);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    Value *V;
    if (SLPReVec && !E->isAltShuffle()) {
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
      if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
        assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
               "Not supported shufflevector usage.");
        SmallVector<int> NewMask(ThisMask.size());
        transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
          return SVSrc->getShuffleMask()[Mask];
        });
        V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
      } else {
        V = Builder.CreateShuffleVector(Src, ThisMask);
      }
      propagateIRFlags(V, E->Scalars, VL0);
      if (auto *I = dyn_cast<Instruction>(V))
        V = ::propagateMetadata(I, E->Scalars);
      V = FinalShuffle(V, E);
    } else {
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");

      Value *LHS = nullptr, *RHS = nullptr;
      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        RHS = vectorizeOperand(E, 1, PostponedPHIs);
      } else {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0, PostponedPHIs);
      }
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      if (LHS && RHS &&
          ((Instruction::isBinaryOp(E->getOpcode()) &&
            (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
           (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
        if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
          if (cast<VectorType>(LHS->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            CastTy = RHS->getType();
          else
            CastTy = LHS->getType();
        }
        if (LHS->getType() != CastTy)
          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
        if (RHS->getType() != CastTy)
          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
      }

      Value *V0, *V1;
      if (Instruction::isBinaryOp(E->getOpcode())) {
        V0 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
        V1 = Builder.CreateBinOp(
            static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
        auto *AltCI = cast<CmpInst>(E->getAltOp());
        CmpInst::Predicate AltPred = AltCI->getPredicate();
        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
      } else {
        if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              cast<VectorType>(LHS->getType())->getElementType());
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              LHS = Builder.CreateIntCast(LHS, VecTy, It->second.second);
            assert(LHS->getType() == VecTy &&
                   "Expected same type as operand.");
            if (auto *I = dyn_cast<Instruction>(LHS))
              LHS = ::propagateMetadata(I, E->Scalars);
            LHS = FinalShuffle(LHS, E);
            E->VectorizedValue = LHS;
            ++NumVectorInstructions;
            return LHS;
          }
        }
        V0 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
        V1 = Builder.CreateCast(
            static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
      }
      // Add V0 and V1 to later analysis to try to find and remove matching
      // instruction, if any.
      for (Value *V : {V0, V1}) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      // Create shuffle to take alternate operations from the vector.
      // Also, gather up main and alt scalar ops to propagate IR flags to
      // each vector operation.
      ValueList OpScalars, AltScalars;
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [E, this](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask, &OpScalars, &AltScalars);

      propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
      propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
        // Drop nuw flags for abs(sub(commutative), true).
        if (auto *I = dyn_cast<Instruction>(Vec);
            I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            any_of(E->Scalars, [](Value *V) {
              if (isa<PoisonValue>(V))
                return false;
              auto *IV = cast<Instruction>(V);
              return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      };
      DropNuwFlag(V0, E->getOpcode());
      DropNuwFlag(V1, E->getAltOpcode());

      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
        transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
      }
      V = Builder.CreateShuffleVector(V0, V1, Mask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        V = ::propagateMetadata(I, E->Scalars);
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  default:
    llvm_unreachable("unknown inst");
  }
  return nullptr;
}
Value *BoUpSLP::vectorizeTree(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues,
    Instruction *ReductionRoot) {
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }
  // ...
  // Cache last instructions for the nodes to avoid side effects, which may
  // appear during vectorization, like extra uses, etc.
  EntryToLastInstruction.clear();
  // Emit gathered loads first to emit better code for the users of those
  // gathered loads.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst &&
        (!TE->isGather() || !TE->UserTreeIndices.empty())) {
      assert((!TE->UserTreeIndices.empty() ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
    }
  }
  // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
    if (TE->State == TreeEntry::Vectorize &&
        TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
        TE->VectorizedValue)
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
  // ...
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
      if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
              TE->UserTreeIndices.front().EdgeIdx)) &&
          VecTE->isSame(TE->Scalars))
        // Found gather node which is absolutely the same as one of the
        // vectorized nodes. It may happen after reordering.
        continue;
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI =
        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator, since the stub instruction emitted for the
    // delayed node may otherwise end up at an invalid position.
    if (isa<PHINode>(UserI)) {
      // Insert before all users.
      Instruction *InsertPt = PrevVec->getParent()->getTerminator();
      for (User *U : PrevVec->users()) {
        if (U == UserI)
          continue;
        auto *UI = dyn_cast<Instruction>(U);
        if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
          continue;
        if (UI->comesBefore(InsertPt))
          InsertPt = UI;
      }
      Builder.SetInsertPoint(InsertPt);
    } else {
      Builder.SetInsertPoint(PrevVec);
    }
    Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
    Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        VecI && VecI->getParent() == Builder.GetInsertBlock() &&
        Builder.GetInsertPoint()->comesBefore(VecI))
      VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
                                 Builder.GetInsertPoint());
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        if (const TreeEntry *BaseTE = getTreeEntry(V)) {
          auto It = MinBWs.find(BaseTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            if (*IsSigned)
              break;
          }
          for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
            auto It = MinBWs.find(MNTE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          // Scan through the buildvector nodes.
          for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
            auto It = MinBWs.find(BVE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              if (*IsSigned)
                break;
            }
          }
          if (IsSigned.value_or(false))
            break;
          if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
            IsSigned =
                IsSigned.value_or(false) ||
                !isKnownNonNegative(EE->getVectorOperand(),
                                    SimplifyQuery(*DL));
            continue;
          }
          if (IsSigned.value_or(false))
            break;
        }
      }
      if (IsSigned.value_or(false)) {
        // Final attempt to check signedness of the value.
        auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(IsSigned &&
             "Expected user node or perfect diamond match in MinBWs.");
      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
    }
    PrevVec->replaceAllUsesWith(Vec);
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node, if it was used before for one of the
    // buildvector nodes already.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    // ...
  }
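  // The loop below re-materializes every externally used scalar: it emits an
  // extractelement from the vectorized value (re-using a cached extract for
  // the same scalar/block when possible), widens or sign-extends it when the
  // scalar was demoted via MinBWs, and rewires PHI users per incoming edge.
  // Inserts that feed wider buildvector sequences are collected into
  // ShuffledInserts and combined into shuffles afterwards.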
  // Extract all of the elements with the external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    llvm::User *User = ExternalUse.User;

    // Skip users that we already RAUW. This happens when one instruction
    // has multiple uses of the same value.
    if (User && !is_contained(Scalar->users(), User))
      continue;
    TreeEntry *E = getTreeEntry(Scalar);
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    // Non-instruction pointers are not deleted, just skip them.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    Value *Lane = Builder.getInt32(ExternalUse.Lane);
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        auto *Inst = dyn_cast<Instruction>(Scalar);
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts, just move the only one in the
          // current block.
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                Builder.GetInsertPoint()->comesBefore(I)) {
              I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                            Builder.GetInsertPoint());
              if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
                CI->moveAfter(I);
            }
            Ex = PrevV;
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // "Reuse" the existing extract to improve final codegen.
          if (ReplaceInst) {
            // Leave the instruction as is, if it cheaper extracts and all
            // operands are scalar.
            if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
              IgnoredExtracts.insert(EE);
              Ex = EE;
            } else {
              auto *CloneInst = Inst->clone();
              CloneInst->insertBefore(Inst);
              if (Inst->hasName())
                CloneInst->takeName(Inst);
              Ex = CloneInst;
            }
          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
                     ES && isa<Instruction>(Vec)) {
            Value *V = ES->getVectorOperand();
            auto *IVec = cast<Instruction>(Vec);
            if (const TreeEntry *ETE = getTreeEntry(V))
              V = ETE->VectorizedValue;
            if (auto *IV = dyn_cast<Instruction>(V);
                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            else
              Ex = Builder.CreateExtractElement(Vec, Lane);
          } else if (auto *VecTy =
                         dyn_cast<FixedVectorType>(Scalar->getType())) {
            // ...
            unsigned VecTyNumElements = VecTy->getNumElements();
            Ex = Builder.CreateExtractVector(
                VecTy, Vec,
                Builder.getInt64(ExternalUse.Lane * VecTyNumElements));
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          // If necessary, sign-extend or zero-extend ScalarRoot
          // to the larger type.
          ExV = Ex;
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(
                Ex, Scalar->getType(),
                !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
          auto *I = dyn_cast<Instruction>(Ex);
          ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
                                            : &F->getEntryBlock(),
                                          std::make_pair(Ex, ExV));
        }
        // The then branch of the previous if may produce constants, since 0
        // operand might be a constant.
        if (auto *ExI = dyn_cast<Instruction>(Ex);
            ExI && !isa<PHINode>(ExI)) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        return ExV;
      }
      assert(isa<FixedVectorType>(Scalar->getType()) &&
             isa<InsertElementInst>(Scalar) &&
             "In-tree scalar of vector type is not insertelement?");
      auto *IE = cast<InsertElementInst>(Scalar);
      VectorToInsertElement.try_emplace(Vec, IE);
      return Vec;
    };
    // If User == nullptr, the Scalar remains as scalar in vectorized
    // instructions or is used as extra arg. Generate ExtractElement
    // instruction and update the record for this scalar in
    // ExternallyUsedValues.
    if (!User) {
      if (!ScalarsWithNullptrUser.insert(Scalar).second)
        continue;
      assert(
          (ExternallyUsedValues.count(Scalar) ||
           ExternalUsesAsOriginalScalar.contains(Scalar) ||
           any_of(Scalar->users(),
                  [&](llvm::User *U) {
                    if (ExternalUsesAsOriginalScalar.contains(U))
                      return true;
                    TreeEntry *UseEntry = getTreeEntry(U);
                    return UseEntry &&
                           (UseEntry->State == TreeEntry::Vectorize ||
                            UseEntry->State ==
                                TreeEntry::StridedVectorize) &&
                           (E->State == TreeEntry::Vectorize ||
                            E->State == TreeEntry::StridedVectorize) &&
                           doesInTreeUserNeedToExtract(
                               Scalar, getRootEntryInstruction(*UseEntry),
                               TLI, TTI);
                  })) &&
          "Scalar with nullptr User must be registered in "
          "ExternallyUsedValues map or remain as scalar in vectorized "
          "instructions");
      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
        if (auto *PHI = dyn_cast<PHINode>(VecI)) {
          if (PHI->getParent()->isLandingPad())
            Builder.SetInsertPoint(
                PHI->getParent(),
                std::next(
                    PHI->getParent()->getLandingPadInst()->getIterator()));
          else
            Builder.SetInsertPoint(PHI->getParent(),
                                   PHI->getParent()->getFirstNonPHIIt());
        } else {
          Builder.SetInsertPoint(VecI->getParent(),
                                 std::next(VecI->getIterator()));
        }
      } else {
        Builder.SetInsertPoint(&F->getEntryBlock(),
                               F->getEntryBlock().begin());
      }
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // Required to update internally referenced instructions.
      if (Scalar != NewInst) {
        assert((!isa<ExtractElementInst>(Scalar) ||
                !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
               "Extractelements should not be replaced.");
        Scalar->replaceAllUsesWith(NewInst);
      }
      continue;
    }

    if (auto *VU = dyn_cast<InsertElementInst>(User);
        VU && VU->getOperand(1) == Scalar) {
      // Skip if the scalar is another vector op or Vec is not an instruction.
      if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
        if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
          if (!UsedInserts.insert(VU).second)
            continue;
          // Need to use original vector, if the root is truncated.
          auto BWIt = MinBWs.find(E);
          if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
            auto *ScalarTy = FTy->getElementType();
            auto Key = std::make_pair(Vec, ScalarTy);
            auto VecIt = VectorCasts.find(Key);
            if (VecIt == VectorCasts.end()) {
              IRBuilderBase::InsertPointGuard Guard(Builder);
              if (auto *IVec = dyn_cast<PHINode>(Vec)) {
                if (IVec->getParent()->isLandingPad())
                  Builder.SetInsertPoint(IVec->getParent(),
                                         std::next(IVec->getParent()
                                                       ->getLandingPadInst()
                                                       ->getIterator()));
                else
                  Builder.SetInsertPoint(
                      IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
              } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
                Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
              }
              Vec = Builder.CreateIntCast(
                  Vec,
                  getWidenedType(
                      ScalarTy,
                      cast<FixedVectorType>(Vec->getType())->getNumElements()),
                  BWIt->second.second);
              VectorCasts.try_emplace(Key, Vec);
            } else {
              Vec = VecIt->second;
            }
          }

          std::optional<unsigned> InsertIdx = getElementIndex(VU);
          if (InsertIdx) {
            auto *It = find_if(
                ShuffledInserts,
                [VU](const ShuffledInsertData<Value *> &Data) {
                  // Checks if 2 insertelements are from the same buildvector.
                  InsertElementInst *VecInsert = Data.InsertElements.front();
                  return areTwoInsertFromSameBuildVector(
                      VU, VecInsert,
                      [](InsertElementInst *II) { return II->getOperand(0); });
                });
            unsigned Idx = *InsertIdx;
            if (It == ShuffledInserts.end()) {
              (void)ShuffledInserts.emplace_back();
              It = std::next(ShuffledInserts.begin(),
                             ShuffledInserts.size() - 1);
            }
            SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
            if (Mask.empty())
              Mask.assign(FTy->getNumElements(), PoisonMaskElem);
            Mask[Idx] = ExternalUse.Lane;
            It->InsertElements.push_back(cast<InsertElementInst>(User));
            continue;
          }
        }
      }
    }

    // Generate extracts for out-of-tree users.
    // Find the insertion point for the extractelement lane.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (PHINode *PH = dyn_cast<PHINode>(User)) {
        // If user is a PHI node, the operands may come from different basic
        // blocks, so we have to find the right place to insert the extract.
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              Builder.SetInsertPoint(
                  PH->getIncomingBlock(I)->getTerminator());
            }
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        Builder.SetInsertPoint(cast<Instruction>(User));
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        User->replaceUsesOfWith(Scalar, NewInst);
      }
    } else {
      Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      User->replaceUsesOfWith(Scalar, NewInst);
    }

    LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
  }
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };

  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
    if (VF != VecVF) {
      if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
        Vec = CreateShuffle(Vec, nullptr, Mask);
        return std::make_pair(Vec, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        Vec = CreateShuffle(Vec, nullptr, ResizeMask);
      }
    }
    return std::make_pair(Vec, false);
  };
  // Perform shuffling of the vectorize tree entries for better handling of
  // external extracts.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // Find the first and the last instruction in the list of insertelements.
    // ...
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    Value *NewInst = performExtractsShuffleAction<Value>(
        MutableArrayRef(Vector.data(), Vector.size()),
        FirstInsert->getOperand(0),
        [](Value *Vec) {
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
        },
        ResizeToVF,
        [&](ArrayRef<Value *> Vals, ArrayRef<int> Mask) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create shuffle if the mask is a simple identity
            // non-resizing mask.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            : FirstInsert->getOperand(0),
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild buildvector chain.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    SmallVector<Instruction *> Inserts;
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      if (*It == II)
        ++It;
      else
        Inserts.push_back(cast<Instruction>(II));
      II = dyn_cast<InsertElementInst>(II->getOperand(0));
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
      NewInst = II;
    }
    LastInsert->replaceAllUsesWith(NewInst);
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            PoisonValue::get(IE->getOperand(0)->getType()));
      IE->replaceUsesOfWith(IE->getOperand(1),
                            PoisonValue::get(IE->getOperand(1)->getType()));
      eraseInstruction(IE);
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
  SmallVector<Instruction *> RemovedInsts;
  // For each vectorized value:
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather())
      continue;

    assert(Entry->VectorizedValue && "Can't find vectorizable value");

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      if (Entry->getOpcode() == Instruction::GetElementPtr &&
          !isa<GetElementPtrInst>(Scalar))
        continue;
      if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
          EE && IgnoredExtracts.contains(EE))
        continue;
      if (isa<PoisonValue>(Scalar))
        continue;
#ifndef NDEBUG
      Type *Ty = Scalar->getType();
      if (!Ty->isVoidTy()) {
        for (User *U : Scalar->users()) {
          // It is legal to delete users in the ignorelist.
          assert((getTreeEntry(U) ||
                  (UserIgnoreList && UserIgnoreList->contains(U)) ||
                  (isa_and_nonnull<Instruction>(U) &&
                   isDeleted(cast<Instruction>(U)))) &&
                 "Deleting out-of-tree value");
        }
      }
#endif
      LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
      auto *I = cast<Instruction>(Scalar);
      RemovedInsts.push_back(I);
    }
  }

  // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
  // new vector instruction.
  if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
    V->mergeDIAssignID(RemovedInsts);

  // Clear up reduction references, if any.
  if (UserIgnoreList) {
    for (Instruction *I : RemovedInsts) {
      const TreeEntry *IE = getTreeEntry(I);
      if (IE->Idx != 0 &&
          !(VectorizableTree.front()->isGather() &&
            !IE->UserTreeIndices.empty() &&
            (ValueToGatherNodes.lookup(I).contains(
                 VectorizableTree.front().get()) ||
             any_of(IE->UserTreeIndices,
                    [&](const EdgeInfo &EI) {
                      return EI.UserTE == VectorizableTree.front().get() &&
                             EI.EdgeIdx == UINT_MAX;
                    }))) &&
          !(GatheredLoadsEntriesFirst.has_value() &&
            IE->Idx >= *GatheredLoadsEntriesFirst &&
            VectorizableTree.front()->isGather() &&
            is_contained(VectorizableTree.front()->Scalars, I)))
        continue;
      SmallVector<SelectInst *> LogicalOpSelects;
      I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
        // Do not replace condition of the logical op in form select <cond>.
        bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
                                    (match(U.getUser(), m_LogicalAnd()) ||
                                     match(U.getUser(), m_LogicalOr())) &&
                                    U.getOperandNo() == 0;
        if (IsPoisoningLogicalOp) {
          LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
          return false;
        }
        return UserIgnoreList->contains(U.getUser());
      });
      // Replace conditions of the poisoning logical ops with the non-poison
      // constant value.
      for (SelectInst *SI : LogicalOpSelects)
        SI->setCondition(
            Constant::getNullValue(SI->getCondition()->getType()));
    }
  }
  // ...
  removeInstructionsAndOperands(ArrayRef(RemovedInsts));

  Builder.ClearInsertionPoint();
  InstrElementSize.clear();

  const TreeEntry &RootTE = *VectorizableTree.front();
  Value *Vec = RootTE.VectorizedValue;
  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
                                      It != MinBWs.end() &&
                                      ReductionBitWidth != It->second.first) {
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(ReductionRoot->getParent(),
                           ReductionRoot->getIterator());
    Vec = Builder.CreateIntCast(
        Vec,
        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
                        cast<VectorType>(Vec->getType())->getElementCount()),
        It->second.second);
  }
  return Vec;
}

void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or the element that we insert into it are instructions
    // that are defined in this basic block then we can't hoist this
    // instruction.
    if (any_of(I->operands(), [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(V);
          return OpI && L->contains(OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(PreHeader->getTerminator());
    CSEBlocks.insert(PreHeader);
  }
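  // After hoisting, the remaining shuffles/extracts are CSE'd: blocks are
  // visited in dominator-tree DFS order so that an equivalent instruction in
  // a dominating block can replace (or absorb, for less-defined shuffle
  // masks) a later one.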
  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  SmallVector<BasicBlock *, 8> CSEWorkList(CSEBlocks.begin(), CSEBlocks.end());
  llvm::stable_sort(CSEWorkList, [this](const BasicBlock *A,
                                        const BasicBlock *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Less defined shuffles can be replaced by the more defined copies.
  // Between two shuffles one is less defined if it has the same vector
  // operands and its mask indices are the same as in the first one or
  // undefs.
  auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
                                           SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the last undefs actually change the final number of used
    // vector registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           TTI->getNumberOfParts(SI1->getType()) ==
               TTI->getNumberOfParts(
                   getWidenedType(SI1->getType()->getElementType(),
                                  SM1.size() - LastUndefsCnt));
  };
  // Perform O(N^2) search over the gather/shuffle sequences and merge
  // identical instructions.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = *I;
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(*BB)) {
      if (isDeleted(&In))
        continue;
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
          !GatherShuffleExtractSeq.contains(&In))
        continue;

      // Check if we can replace this instruction with any of the visited
      // instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(V->getParent(), In.getParent())) {
          In.replaceAllUsesWith(V);
          eraseInstruction(&In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
            GatherShuffleExtractSeq.contains(V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(In.getParent(), V->getParent())) {
          In.moveAfter(V);
          V->replaceAllUsesWith(&In);
          eraseInstruction(V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced) {
        assert(!is_contained(Visited, &In));
        Visited.push_back(&In);
      }
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
  ScheduleData *Bundle = nullptr;
  ScheduleData *PrevInBundle = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member "
           "(maybe not in same basic block)");
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");
    if (PrevInBundle) {
      PrevInBundle->NextInBundle = BundleMember;
    } else {
      Bundle = BundleMember;
    }

    // Group the instructions to a bundle.
    BundleMember->FirstInBundle = Bundle;
    PrevInBundle = BundleMember;
  }
  assert(Bundle && "Failed to find schedule bundle");
  return Bundle;
}
// Groups the instructions to a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  // No need to schedule PHIs, insertelement and extractelement instructions
  // with constant operands, or bundles that do not require scheduling at all.
  if (isa<PHINode>(S.getMainOp()) ||
      isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
    return nullptr;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.getMainOp() << "\n");

  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
                                                       ScheduleData *Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    // It is seldom that this needs to be done a second time after adding the
    // initial bundle to the region.
    if (ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
        if (ScheduleData *SD = getScheduleData(I))
          SD->clearDependencies();
      ReSchedule = true;
    }
    if (Bundle) {
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
    }

    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are no
    // cyclic dependencies and we can schedule it. Note that's important that we
    // don't "schedule" the bundle yet.
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleData *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
             "must be ready to schedule");
      schedule(Picked, ReadyInsts);
    }
  };

  // Make sure that the scheduling region contains all
  // instructions of the bundle.
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the scheduling region got new instructions at the lower end (or it
      // is a new region for the first bundle), it is still necessary to
      // recalculate all dependencies, otherwise the compiler may emit
      // instructions in the wrong order at the actual scheduling.
      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");

    // Make sure we don't leave the pieces of the bundle in the ready list when
    // the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);

    if (!BundleMember->IsScheduled)
      continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
    LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  auto *Bundle = buildBundle(VL);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle->isReady()) {
    cancelScheduling(VL, S.getMainOp());
    return std::nullopt;
  }
  return Bundle;
}
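// Note: TryScheduleBundleImpl doubles as a cycle check. Scheduling ready
// instructions can only decrease the unscheduled-dependency counters of the
// new bundle, so if the ready list drains before Bundle->isReady() becomes
// true, some member transitively depends on the bundle itself and the attempt
// is rolled back via cancelScheduling(). A caller therefore distinguishes
// three results (sketch, assuming the signature above):
//
//   std::optional<BoUpSLP::ScheduleData *> B = BS->tryScheduleBundle(VL, SLP, S);
//   if (!B)        { /* unschedulable (cycle or region limit): gather instead */ }
//   else if (!*B)  { /* no scheduling needed (phis, vector-like const ops) */ }
//   else           { /* a valid, ready bundle */ }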
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
                                                Value *OpValue) {
  if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
      doesNotNeedToSchedule(VL))
    return;

  if (doesNotNeedToBeScheduled(OpValue))
    OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
  ScheduleData *Bundle = getScheduleData(OpValue);
  LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() &&
         (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
         "tried to unbundle something which is not a bundle");

  // Remove the bundle from the ready list.
  if (Bundle->isReady())
    ReadyInsts.remove(Bundle);

  // Un-bundle: make single instructions out of the bundle.
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->TE = nullptr;
    if (BundleMember->unscheduledDepsInBundle() == 0) {
      ReadyInsts.insert(BundleMember);
    }
    BundleMember = Next;
  }
}
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
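// Note: ScheduleData nodes are bump-allocated from fixed-size arrays owned by
// ScheduleDataChunks and are never freed individually; stale nodes are
// invalidated wholesale when SchedulingRegionID is bumped for a new region.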
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
         !doesNotNeedToBeScheduled(I) &&
         "phi nodes/insertelements/extractelements/extractvalues don't need to "
         "be scheduled");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "assume-like" intrinsics) so that's not
  // counted against the budget; otherwise debug info could affect codegen.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
      return false;
    }
    ++UpIter;
    ++DownIter;

    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
    if (doesNotNeedToBeScheduled(I))
      continue;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
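// Note: besides allocating per-instruction ScheduleData, this routine threads
// all memory-accessing instructions of the region into a second singly linked
// list via NextLoadStore (head FirstLoadStoreInRegion, tail
// LastLoadStoreInRegion). calculateDependencies() later adds memory edges by
// walking only that list (sketch):
//
//   for (ScheduleData *SD = FirstLoadStoreInRegion; SD; SD = SD->NextLoadStore)
//     ; // only loads, stores and memory-effecting calls are visited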
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  SmallVector<ScheduleData *, 10> WorkList;
  WorkList.push_back(SD);

  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.pop_back_val();
    for (ScheduleData *BundleMember = SD; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      assert(isInSchedulingRegion(BundleMember));
      if (BundleMember->hasValidDependencies())
        continue;

      BundleMember->Dependencies = 0;
      BundleMember->resetUnscheduledDeps();

      // Handle def-use chain dependencies.
      for (User *U : BundleMember->Inst->users()) {
        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
      }

      auto MakeControlDependent = [&](Instruction *I) {
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->ControlDependencies.push_back(BundleMember);
        BundleMember->Dependencies++;
        ScheduleData *DestBundle = DepDest->FirstInBundle;
        if (!DestBundle->IsScheduled)
          BundleMember->incrementUnscheduledDeps(1);
        if (!DestBundle->hasValidDependencies())
          WorkList.push_back(DestBundle);
      };

      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
        for (Instruction *I = BundleMember->Inst->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
            continue;

          // Add the dependency.
          MakeControlDependent(I);

          if (!isGuaranteedToTransferExecutionToSuccessor(I))
            // Everything past here must be control dependent on I.
            break;
        }
      }

      if (RegionHasStackSave) {
        // If we have an inalloca alloca instruction, it needs to be scheduled
        // after any preceding stacksave. We also need to prevent any alloca
        // from reordering above a preceding stackrestore.
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst,
                  m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              // Any allocas past here must be control dependent on I, and I
              // must be memory dependent on BundleMember->Inst.
              break;

            if (!isa<AllocaInst>(I))
              continue;

            // Add the dependency.
            MakeControlDependent(I);
          }
        }

        // In addition to the cases handled just above, we need to prevent
        // allocas and loads/stores from moving below a stacksave or a
        // stackrestore. Moving loads/stores below a stackrestore can lead to
        // incorrect code.
        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              continue;

            // Add the dependency.
            MakeControlDependent(I);
            break;
          }
        }
      }

      // Handle the memory dependencies (if any).
      ScheduleData *DepDest = BundleMember->NextLoadStore;
      if (!DepDest)
        continue;
      Instruction *SrcInst = BundleMember->Inst;
      assert(SrcInst->mayReadOrWriteMemory() &&
             "NextLoadStore list for non memory effecting bundle?");
      MemoryLocation SrcLoc = getLocation(SrcInst);
      bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
      unsigned NumAliased = 0;
      unsigned DistToSrc = 1;

      for (; DepDest; DepDest = DepDest->NextLoadStore) {
        assert(isInSchedulingRegion(DepDest));

        // Two limits keep this walk cheap: AliasedCheckLimit caps the number
        // of expensive SLP->isAliased() queries, and MaxMemDepDistance caps
        // the walk length; past either limit a dependency is conservatively
        // assumed.
        if (DistToSrc >= MaxMemDepDistance ||
            ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
             (NumAliased >= AliasedCheckLimit ||
              SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
          // We increment the counter only if the locations are aliased
          // (instead of counting all alias checks). This gives a better
          // balance between reduced runtime and accurate dependencies.
          NumAliased++;

          DepDest->MemoryDependencies.push_back(BundleMember);
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = DepDest->FirstInBundle;
          if (!DestBundle->IsScheduled) {
            BundleMember->incrementUnscheduledDeps(1);
          }
          if (!DestBundle->hasValidDependencies()) {
            WorkList.push_back(DestBundle);
          }
        }
        DistToSrc++;
      }
    }
    if (InsertInReadyList && SD->isReady()) {
      ReadyInsts.insert(SD);
      LLVM_DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst
                        << "\n");
    }
  }
}
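// Note: Dependencies counts every scheduling edge recorded for a bundle member
// (def-use edges to in-region users, control-dependence edges, and memory
// edges), while the unscheduled-deps counter tracks only those edges whose
// destination bundle has not been scheduled yet; a bundle becomes ready
// exactly when the unscheduled counters of all its members reach zero. The
// memory scan above trades dependence precision (extra conservative edges
// beyond the limits) for a non-quadratic walk.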
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
    if (ScheduleData *SD = getScheduleData(I)) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->IsScheduled = false;
      SD->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // A key point: if we got here, pre-scheduling was able to find a valid
  // scheduling of the sub-graph of the scheduling window which consists of all
  // vector bundles and their transitive users. As such, we do not need to
  // reschedule anything outside of that sub-graph.

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  // WARNING: If changing this order causes a correctness issue, that means
  // there is some missing dependence edge in the schedule data graph.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] TreeEntry *SDTE = getTreeEntry(SD->Inst);
      assert((isVectorLikeInstWithConstOps(SD->Inst) ||
              SD->isPartOfBundle() ==
                  (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;

      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, /*InsertInReadyList=*/false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      Instruction *PickedInst = BundleMember->Inst;
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveBefore(LastScheduledInst);
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  // ...
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
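// Note: because SchedulingPriority was assigned in original instruction order,
// the std::set above always picks, among all ready bundles, the one whose head
// appeared first in the block, keeping the final schedule as close to the
// source order as the dependence graph allows.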
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value (or value
  // truncated just before storing) without traversing the expression tree.
  // This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type. We want to base the vector element size on the width
  // of memory operations where possible.
  if (auto *I = dyn_cast<Instruction>(V)) {
    SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
    SmallPtrSet<Instruction *, 16> Visited;
    Worklist.emplace_back(I, I->getParent());
    Visited.insert(I);

    // Traverse the expression tree in bottom-up order looking for loads.
    unsigned Width = 0;
    Value *FirstNonBool = nullptr;
    while (!Worklist.empty()) {
      auto [I, Parent] = Worklist.pop_back_val();

      // We should only be looking at scalar instructions here. If the current
      // instruction has a vector type, skip.
      auto *Ty = I->getType();
      if (isa<VectorType>(Ty))
        continue;
      if (Ty != Builder.getInt1Ty() && !FirstNonBool)
        FirstNonBool = I;

      // If the current instruction is a load, update Width to reflect the
      // width of the loaded value.
      if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
        Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));

      // Otherwise, we need to visit the operands of the instruction. If an
      // operand is an instruction we haven't yet visited and from the same
      // basic block as the user or the use is a PHI node, we add it to the
      // worklist.
      else
        for (Use &U : I->operands()) {
          if (auto *J = dyn_cast<Instruction>(U.get()))
            if (Visited.insert(J).second &&
                (isa<PHINode>(I) || J->getParent() == Parent)) {
              Worklist.emplace_back(J, J->getParent());
              continue;
            }
          if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
            FirstNonBool = U.get();
        }
    }

    // If we didn't encounter a memory access in the expression tree, or if we
    // gave up for some reason, just return the width of V. Otherwise, return
    // the maximum width we found.
    if (!Width) {
      if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
        V = FirstNonBool;
      Width = DL->getTypeSizeInBits(V->getType());
    }

    InstrElementSize[I] = Width;
    return Width;
  }

  return DL->getTypeSizeInBits(V->getType());
}
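// Note: the element size computed here feeds the VF choice as
// MaxVF = RegisterWidth / ElementSize. Basing it on the widest load/extract
// found in the feeding expression, rather than on V's own type, avoids
// over-wide VFs for expressions that merely narrow wide loads: with 128-bit
// registers, an i8 expression rooted in i64 loads is sized by the loads,
// 128 / 64 = 2 lanes, not 128 / 8 = 16.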
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  if (OrigBitWidth == BitWidth) {
    MaxDepthLevel = 1;
    return true;
  }

  // Check if the node was analyzed already and must keep its original
  // bitwidth.
  if (NodesToKeepBWs.contains(E.Idx))
    return false;

  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    if (isa<PoisonValue>(R))
      return false;
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(V))
      return true;
    unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
    bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
      if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
        return true;
    }
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    if (auto *I = dyn_cast<Instruction>(V)) {
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
        if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
          break;
        BitWidth2 *= 2;
      }
      BitWidth1 = std::min(BitWidth1, BitWidth2);
    }
    BitWidth = std::max(BitWidth, BitWidth1);
    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
  };
  using namespace std::placeholders;
  auto FinalAnalysis = [&]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      // Check possible extractelement instruction bases and the final vector
      // length.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
              TTI->getNumberOfParts(getWidenedType(
                  IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !getTreeEntry(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !all_of(V->users(), [=](User *U) {
          return getTreeEntry(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BestFailBitwidth == 0) {
          BitWidth = OrigBitWidth;
          return false;
        }
        MaxDepthLevel = 1;
        BitWidth = BestFailBitwidth;
        NeedToExit = true;
        return true;
      };
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                              std::ref(BitWidth)));
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (E.UserTreeIndices.size() > 1 &&
              !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                           std::ref(BitWidth))))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };
  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // in-range amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // If this is a truncate of a logical shr, we can truncate it to a smaller
    // lshr iff we know that the bits we would otherwise be shifting in are
    // already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // If this is a truncate of an arithmetic shr, we can truncate it to a
    // smaller ashr iff we know that all the bits from the sign bit of the
    // original type and the sign bit of the truncate type are similar.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                                nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands. Note that
  // we don't need to worry about cycles since we ensure single use above.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));

    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin &&
        ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      unsigned MinBW = PowerOf2Ceil(BitWidth);
      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
      auto VecCallCosts = getVectorCallCosts(
          IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
          TTI, TLI, ArgTys);
      InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
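// Note: a worked example of the demotion rule for LShr above: demoting
// `lshr i32 %x, 8` to i16 is legal only when the 16 bits that would be
// shifted in from above are already known zero, i.e. the shift amount (8) is
// provably < 16 and MaskedValueIsZero(%x, 0xFFFF0000) holds; both checks
// appear verbatim in LShrChecker.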
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->getOpcode() == Instruction::Store ||
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  if (VectorizableTree[NodeIdx]->isGather() ||
      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
      (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                              [NodeIdx](const EdgeInfo &EI) {
                                return EI.UserTE->Idx > NodeIdx;
                              })))
    return;

  // The first value node for store/insertelement is sext/zext/trunc? Skip it,
  // it is part of the original graph. Also it is profitable to demote the
  // node, if the root node is trunc.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  SmallDenseSet<unsigned, 8> NodesToKeepBWs;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
                                bool IsProfitableToDemoteRoot, unsigned Opcode,
                                unsigned Limit, bool IsTruncRoot,
                                bool IsSignedCmp) -> unsigned {
    ToDemote.clear();
    // Check if the root is trunc and the next node is gather/buildvector, then
    // keep trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
        !NodesToKeepBWs.contains(E.Idx) &&
        E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    const TreeEntry *TE = getTreeEntry(U);
                    const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
                    if (TE == UserTE || !TE)
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(UserTE->getMainOp()))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    auto It = MinBWs.find(TE);
                    if (It != MinBWs.end() && It->second.first > UserTESz)
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT || !Opcode)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    unsigned NumParts = TTI->getNumberOfParts(
        getWidenedType(TreeRootIT, VF * getNumElements(ScalarTy)));

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type,
    // rather than sign-extended. We know that if the leading bits are not
    // demanded, we can safely zero-extend.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      if (isa<PoisonValue>(R))
        return true;
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    // We first check if all the bits of the roots are demanded. If they're
    // not, we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      if (isa<PoisonValue>(Root))
        continue;
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
      TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, we must add one to the
      // maximum bit width to account for the unknown sign bit. This preserves
      // the existing sign bit so we can safely sign-extend the root back to
      // the original type. Otherwise, if we know the sign bit is zero, we will
      // zero-extend the root instead.
      if (!IsKnownPositive)
        ++BitWidth1;

      APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // If the original type is large but the reduced type does not improve the
    // register use, ignore it.
    if (NumParts > 1 &&
        NumParts ==
            TTI->getNumberOfParts(getWidenedType(
                IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
      return 0u;

    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the values that can be demoted in ToDemote.
    DenseSet<const TreeEntry *> Visited;
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
                               NeedToDemote, IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(
                         E.getMainOp()->getOperand(0)->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);

    return MaxBitWidth;
  };

  // Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
    if (all_of(*UserIgnoreList,
               [](Value *V) {
                 return isa<PoisonValue>(V) ||
                        cast<Instruction>(V)->getOpcode() == Instruction::Add;
               }) &&
        VectorizableTree.front()->State == TreeEntry::Vectorize &&
        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
            Builder.getInt1Ty()) {
      ReductionBitWidth = 1;
    } else {
      for (Value *V : *UserIgnoreList) {
        if (isa<PoisonValue>(V))
          continue;
        unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
        unsigned BitWidth1 = NumTypeBits - NumSignBits;
        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
          ++BitWidth1;
        unsigned BitWidth2 = BitWidth1;
        if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
          APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
        }
        ReductionBitWidth =
            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
      }
      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
        ReductionBitWidth = 8;

      ReductionBitWidth = bit_ceil(ReductionBitWidth);
    }
  }
  bool IsTopRoot = NodeIdx == 0;
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
        Limit, IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [](const EdgeInfo &EI) {
                   return EI.EdgeIdx == 0 &&
                          EI.UserTE->getOpcode() == Instruction::Trunc &&
                          !EI.UserTE->isAltShuffle();
                 });
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [&](const EdgeInfo &EI) {
                   return EI.UserTE->getOpcode() == Instruction::ICmp &&
                          any_of(EI.UserTE->Scalars, [&](Value *V) {
                            auto *IC = dyn_cast<ICmpInst>(V);
                            return IC &&
                                   (IC->isSigned() ||
                                    !isKnownNonNegative(IC->getOperand(0),
                                                        SimplifyQuery(*DL)) ||
                                    !isKnownNonNegative(IC->getOperand(1),
                                                        SimplifyQuery(*DL)));
                          });
                 });
    }

    // If the maximum bit width we compute is less than the width of the roots'
    // type, we can proceed with the narrowing. Otherwise, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
                ->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
      NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(R))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
  if (!RunSLPVectorization)
    return false;
  SE = SE_;
  TTI = TTI_;
  TLI = TLI_;
  AA = AA_;
  LI = LI_;
  DT = DT_;
  AC = AC_;
  DB = DB_;
  DL = &F.getDataLayout();

  Stores.clear();
  GEPs.clear();
  bool Changed = false;

  // If the target claims to have no vector registers don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom up slp vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // Update DFS numbers now so that we can use them for ordering.
  DT->updateDFSNumbers();

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
      continue;

    // Start new block - clear the list of reduction roots.
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence();
    LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
  }
  return Changed;
}
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  Size = 0;
  LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
                    << "\n");
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!hasFullVectorsOrPowerOf2(
          *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
          VF) ||
      VF < 2 || VF < MinVF) {
    // Check if vectorizing with a non-power-of-2 VF should be considered. At
    // the moment, only consider cases where VF + 1 is a power-of-2, i.e.
    // almost all vector lanes are used.
    if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
      return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsAllowedSize =
        hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
                                 ValOps.size()) ||
        (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
    if ((!IsAllowedSize && S.getOpcode() &&
         S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(),
                 [&](Value *V) {
                   return !isa<ExtractElementInst>(V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(V->users(), [&](User *U) {
                             return !Stores.contains(U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
      Size = (!IsAllowedSize && S.getOpcode()) ? 1 : 2;
      return false;
    }
  }
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if the tree is tiny and the store itself or its value is not
  // vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
                    << "\n");
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}
/// Checks if the candidate tree sizes in the given slice are uniform enough
/// to justify retrying vectorization of the whole slice as one graph.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  unsigned Num = 0;
  uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        if (Size == 1)
          return V;
        ++Num;
        return V + Size;
      });
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = std::accumulate(
                     Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                     [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
                       unsigned P = First ? Val.first : Val.second;
                       if (P == 1)
                         return V;
                       return V + (P - Mean) * (P - Mean);
                     }) /
                 Num;
  return Dev * 81 / (Mean * Mean) == 0;
}
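// Note: with integer division, `Dev * 81 / (Mean * Mean) == 0` is the test
// Dev < Mean*Mean/81, i.e. a relative standard deviation below roughly 1/9.
// Example: candidate tree sizes {4, 4, 4, 5} give Sum = 17, Num = 4,
// Mean = 17 / 4 = 4, Dev = (0 + 0 + 0 + 1) / 4 = 0, and 0 * 81 / 16 == 0, so
// the slice is considered uniform enough to retry as one vector graph.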
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store twice.
  DenseSet<Value *> VectorizedStores;
  bool Changed = false;

  struct StoreDistCompare {
    bool operator()(const std::pair<unsigned, int> &Op1,
                    const std::pair<unsigned, int> &Op2) const {
      return Op1.second < Op2.second;
    }
  };
  // A set of pairs (index of store in Stores array ref, Distance of the store
  // address relative to base store address in units).
  using StoreIndexToDistSet =
      std::set<std::pair<unsigned, int>, StoreDistCompare>;
  auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
    int PrevDist = -1;
    BoUpSLP::ValueList Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(Set)) {
      if (Operands.empty() || Data.second - PrevDist == 1) {
        Operands.push_back(Stores[Data.first]);
        PrevDist = Data.second;
        if (Idx != Set.size() - 1)
          continue;
      }
      auto E = make_scope_exit([&, &DataVar = Data]() {
        Operands.clear();
        Operands.push_back(Stores[DataVar.first]);
        PrevDist = DataVar.second;
      });

      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      unsigned MinVF = std::max<unsigned>(
          2, PowerOf2Ceil(TTI->getStoreMinimumVF(
                 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
                 ValueTy)));

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < " << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // lanes are used.
        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
        if (has_single_bit(CandVF + 1)) {
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        }
      }

      unsigned MaxRegVF = MaxVF;
      MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < " << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
      SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
      unsigned Size = MinVF;
      for_each(reverse(CandidateVFs), [&](unsigned &VF) {
        VF = Size > MaxVF ? NonPowerOf2VF : Size;
        Size *= 2;
      });
      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
      for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
        P.first = P.second = 1;
      });
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned Size : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned StartIdx = std::distance(
              RangeSizes.begin(),
              find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
                                            std::placeholders::_1)));
          while (StartIdx < End) {
            unsigned EndIdx =
                std::distance(RangeSizes.begin(),
                              find_if(RangeSizes.drop_front(StartIdx),
                                      std::bind(IsVectorized, Size >= MaxRegVF,
                                                std::placeholders::_1)));
            unsigned Sz = EndIdx >= End ? End : EndIdx;
            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
              if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
                                  Size >= MaxRegVF)) {
                ++Cnt;
                continue;
              }
              ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
                  Cnt += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
              if (!Res) {
                NonSchedulable
                    .try_emplace(Slice.front(), std::make_pair(Size, Size))
                    .first->getSecond()
                    .second = Size;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert(Slice.begin(), Slice.end());
                AnyProfitableGraph = RepeatChanged = Changed = true;
                // If we vectorized the initial block, no need to try to
                // vectorize it again.
                for_each(RangeSizes.slice(Cnt, Size),
                         [](std::pair<unsigned, unsigned> &P) {
                           P.first = P.second = 0;
                         });
                if (Cnt < StartIdx + MinVF) {
                  for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  StartIdx = Cnt + Size;
                }
                if (Cnt > Sz - Size - MinVF) {
                  for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  if (Sz == End)
                    End = Cnt;
                  Sz = Cnt;
                }
                Cnt += Size;
                continue;
              }
              if (Size > 2 && Res &&
                  !all_of(RangeSizes.slice(Cnt, Size),
                          std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
                                    std::placeholders::_1))) {
                Cnt += Size;
                continue;
              }
              // Check for very big VFs that we're not rebuilding the same
              // trees, just with a larger number of elements.
              if (Size > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(Cnt, Size),
                         std::bind(FirstSizeSame, TreeSize,
                                   std::placeholders::_1))) {
                Cnt += Size;
                while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
                  ++Cnt;
                continue;
              }
              if (TreeSize > 1)
                for_each(RangeSizes.slice(Cnt, Size),
                         [&](std::pair<unsigned, unsigned> &P) {
                           if (Size >= MaxRegVF)
                             P.second = std::max(P.second, TreeSize);
                           else
                             P.first = std::max(P.first, TreeSize);
                         });
              ++Cnt;
              AnyProfitableGraph = true;
            }
            if (StartIdx >= End)
              break;
            if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
              AnyProfitableGraph = true;
            StartIdx = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(Sz),
                        std::bind(IsNotVectorized, Size >= MaxRegVF,
                                  std::placeholders::_1)));
          }
          if (!AnyProfitableGraph && Size >= MaxRegVF)
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if tried all attempts or no need for the last attempts at all.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            Operands.size() - 1,
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes, std::bind(IsNotVectorized, true,
                                                  std::placeholders::_1))) +
                1));
        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
        unsigned Limit =
            getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
        CandidateVFs.clear();
        if (bit_floor(Limit) == VF)
          CandidateVFs.push_back(Limit);
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        });
        // Last attempt to vectorize the max number of elements, if all
        // previous attempts were unsuccessful because of the cost issues.
        CandidateVFs.push_back(VF);
      }
    }
  };
  // Stores pair (first: index of the store into Stores array ref, address of
  // which taken as base; second: sorted set of pairs {index, dist}, which are
  // indices of stores in the set and their store location distances relative
  // to the base address).
  SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;

  // Inserts the specified store SI with the given index Idx to the set of the
  // stores. If a store with the same distance is found already, try to
  // vectorize the already-found stores first.
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
      std::optional<int> Diff = getPointersDiff(
          Stores[Set.first]->getValueOperand()->getType(),
          Stores[Set.first]->getPointerOperand(),
          SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
          /*StrictCheck=*/true);
      if (!Diff)
        continue;
      auto It = Set.second.find(std::make_pair(Idx, *Diff));
      if (It == Set.second.end()) {
        Set.second.emplace(Idx, *Diff);
        return;
      }
      // Try to vectorize the first found set to avoid duplicate analysis.
      TryToVectorize(Set.second);
      unsigned ItIdx = It->first;
      int ItDist = It->second;
      StoreIndexToDistSet PrevSet;
      copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
              [&](const std::pair<unsigned, int> &Pair) {
                return Pair.first > ItIdx;
              });
      Set.second.clear();
      Set.first = Idx;
      Set.second.emplace(Idx, 0);
      // Insert stores that followed the previous match to try to vectorize
      // them with this store.
      unsigned StartIdx = ItIdx + 1;
      SmallBitVector UsedStores(Idx - StartIdx);
      // Distances to the previously found dup store (or this store, since
      // they store to the same addresses).
      SmallVector<int> Dists(Idx - StartIdx, 0);
      for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
        // Do not try to vectorize sequences, we already tried.
        if (VectorizedStores.contains(Stores[Pair.first]))
          break;
        unsigned BI = Pair.first - StartIdx;
        UsedStores.set(BI);
        Dists[BI] = Pair.second - ItDist;
      }
      for (unsigned I = StartIdx; I < Idx; ++I) {
        unsigned BI = I - StartIdx;
        if (UsedStores.test(BI))
          Set.second.emplace(I, Dists[BI]);
      }
      return;
    }
    auto &Res = SortedStores.emplace_back();
    Res.first = Idx;
    Res.second.emplace(Idx, 0);
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (auto &Set : SortedStores)
        TryToVectorize(Set.second);
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (auto &Set : SortedStores)
    TryToVectorize(Set.second);

  return Changed;
}
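// Note: FillStoresSet keys candidate sets by the pointer distance to a
// per-set base store (via getPointersDiff), so each SortedStores entry holds
// stores to related addresses sorted by distance, and TryToVectorize only has
// to cut that set into runs with distance step 1. A duplicate
// {index, distance} pair means the same address is stored twice; the set is
// then flushed and restarted from the later store.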
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type,
  // we permit an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S.getOpcode())
    return false;

  Instruction *I0 = S.getMainOp();
  // Make sure invalid types (including vector type) are rejected before
  // determining the vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
      // NOTE: the following will give user internal llvm type name, which may
      // not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream rso(TypeStr);
        Ty->print(rso);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();
  Type *ScalarTy = getValueType(VL[0]);

  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
    // No actual vectorization should happen, if the number of parts is the
    // same as the provided vectorization factor (i.e. the scalar type is used
    // for vector code during codegen).
    auto *VecTy = getWidenedType(ScalarTy, VF);
    if (TTI->getNumberOfParts(VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(MaxInst - I, VF);

      if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
        continue;

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
        break;

      SmallVector<Value *> Ops(ActualVF, nullptr);
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(V);
            !Inst || !R.isDeleted(Inst)) {
          Ops[Idx] = V;
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough vectorizable instructions - exit.
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      R.reorderTopToBottom();
      R.reorderBottomToTop(
          /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
          !R.doesRootHaveInTreeUses());
      R.transformNodes();
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      if (Cost < -SLPCostThreshold) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  Value *P = I->getParent();

  // Vectorize in the current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidates.
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
      R);
}
namespace {
/// Model horizontal reductions: trees of reduction operations whose leaves
/// can be put into a vector.
class HorizontalReduction {
  using ReductionOpsType = SmallVector<Value *, 16>;
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  ReductionOpsListType ReductionOps;
  /// List of possibly reduced values.
  SmallVector<SmallVector<Value *>> ReducedVals;
  /// Maps reduced value to the corresponding reduction operations.
  SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
  WeakTrackingVH ReductionRoot;
  /// The type of reduction operation.
  RecurKind RdxKind;
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;

  static bool isCmpSelMinMax(Instruction *I) {
    return isa<SelectInst>(I) &&
           RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
  }

  // And/or are potentially poison-safe logical patterns like:
  // select x, y, false
  // select x, true, y
  static bool isBoolLogicOp(Instruction *I) {
    return isa<SelectInst>(I) &&
           (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
  }

  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I) {
    if (Kind == RecurKind::None)
      return false;

    // Integer ops that map to select instructions or intrinsics are fine.
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
        isBoolLogicOp(I))
      return true;

    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0. We do not have to
      // rule out -0.0 here because the intrinsic semantics do not specify a
      // fixed result for it.
      return I->getFastMathFlags().noNaNs();
    }

    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;

    return I->isAssociative();
  }

  static Value *getRdxOperand(Instruction *I, unsigned Index) {
    // Poison-safe 'or' takes the form: select X, true, Y.
    // To make that work with the normal operand processing, we skip the
    // true value operand.
    if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
      return I->getOperand(2);
    return I->getOperand(Index);
  }

  /// Creates reduction operation with the current opcode.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    switch (Kind) {
    case RecurKind::Or:
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
      return Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS,
          RHS, Name);
    case RecurKind::And:
      if (UseSelect &&
          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
        return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
      return Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS,
          RHS, Name);
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul:
      return Builder.CreateBinOp(
          (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS,
          RHS, Name);
    case RecurKind::FMax:
      return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
    case RecurKind::FMin:
      return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
    case RecurKind::FMaximum:
      return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
    case RecurKind::FMinimum:
      return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
    case RecurKind::SMax:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
    case RecurKind::SMin:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
    case RecurKind::UMax:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
    case RecurKind::UMin:
      if (UseSelect) {
        Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
        return Builder.CreateSelect(Cmp, LHS, RHS, Name);
      }
      return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
  }

  /// Creates reduction operation with the current opcode with the IR flags
  /// from \p ReductionOps, dropping nuw/nsw flags.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
        propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
                         /*IncludeWrapFlags=*/false);
        propagateIRFlags(Op, ReductionOps[1], nullptr,
                         /*IncludeWrapFlags=*/false);
        return Op;
      }
    }
    propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
    return Op;
  }
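  // Note: UseSelect distinguishes the two scalar forms a min/max (or logical
  // and/or) reduction can take in IR. With UseSelect the emitted pattern is
  // the cmp+select idiom, otherwise the intrinsic form, e.g. for SMax:
  //
  //   %c = icmp sgt i32 %lhs, %rhs          ; UseSelect == true
  //   %m = select i1 %c, i32 %lhs, i32 %rhs
  // vs.
  //   %m = call i32 @llvm.smax.i32(i32 %lhs, i32 %rhs)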
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    if (match(I, m_And(m_Value(), m_Value())) ||
        match(I, m_LogicalAnd(m_Value(), m_Value())))
      return RecurKind::And;
    if (match(I, m_Or(m_Value(), m_Value())) ||
        match(I, m_LogicalOr(m_Value(), m_Value())))
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;

    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
      return RecurKind::FMax;
    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
      return RecurKind::FMin;

    if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
      return RecurKind::FMaximum;
    if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
      return RecurKind::FMinimum;
    // This matches either cmp+select or intrinsics.
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Try harder: look for a min/max pattern based on instructions producing
      // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
      CmpInst::Predicate Pred;
      Instruction *L1;
      Instruction *L2;

      Value *LHS = Select->getTrueValue();
      Value *RHS = Select->getFalseValue();
      Value *Cond = Select->getCondition();

      // TODO: Support inverse predicates.
      if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
        if (!isa<ExtractElementInst>(RHS) ||
            !L2->isIdenticalTo(cast<Instruction>(RHS)))
          return RecurKind::None;
      } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
        if (!isa<ExtractElementInst>(LHS) ||
            !L1->isIdenticalTo(cast<Instruction>(LHS)))
          return RecurKind::None;
      } else {
        if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
          return RecurKind::None;
        if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))))
          return RecurKind::None;
        if (!L1->isIdenticalTo(cast<Instruction>(LHS)) ||
            !L2->isIdenticalTo(cast<Instruction>(RHS)))
          return RecurKind::None;
      }

      switch (Pred) {
      default:
        return RecurKind::None;
      case CmpInst::ICMP_SGT:
      case CmpInst::ICMP_SGE:
        return RecurKind::SMax;
      case CmpInst::ICMP_SLT:
      case CmpInst::ICMP_SLE:
        return RecurKind::SMin;
      case CmpInst::ICMP_UGT:
      case CmpInst::ICMP_UGE:
        return RecurKind::UMax;
      case CmpInst::ICMP_ULT:
      case CmpInst::ICMP_ULE:
        return RecurKind::UMin;
      }
    }
    return RecurKind::None;
  }
  /// Get the index of the first operand.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB.
  /// For a cmp+sel min/max reduction check that both ops are in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // SelectInst must be used twice while the condition op must have a
      // single use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }

    // Arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }

  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }

  static bool isGoodForReduction(ArrayRef<Value *> Data) {
    int Sz = Data.size();
    auto *I = dyn_cast<Instruction>(Data.front());
    return Sz > 1 || isConstant(Data.front()) ||
           (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
  }
public:
  HorizontalReduction() = default;

  /// Try to find a reduction tree.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;

    // Analyze "regular" integer/FP types for reductions - no target-specific
    // types or pointers.
    Type *Ty = Root->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;

    // Though the ultimate reduction may have multiple uses, its condition
    // must have only a single use.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(Root, 0));
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // If the edge is not an instruction, or it is different from the main
        // reduction opcode or has too many uses - possible reduced value.
        // Also, do not try to reduce const values, if the operation is not
        // foldable.
        if (!EdgeInst || Level > RecursionMaxDepth ||
            getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to try to
    // reduce them. Values are grouped by their value ids, instructions by
    // instruction op id and/or alternate op id, plus extra analysis for loads
    // (grouping them by the distance between pointers).
    SmallMapVector<
        size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
        8>
        PossibleReducedVals;
    initReductionOps(Root);
    DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;

    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Key = hash_combine(hash_value(LI->getParent()), Key);
      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
      if (!LoadKeyUsed.insert(Key).second) {
        auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
        if (LIt != LoadsMap.end()) {
          for (LoadInst *RLI : LIt->second) {
            if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                                LI->getType(), LI->getPointerOperand(), DL, SE,
                                /*StrictCheck=*/true))
              return hash_value(RLI->getPointerOperand());
          }
          for (LoadInst *RLI : LIt->second) {
            if (arePointersCompatible(RLI->getPointerOperand(),
                                      LI->getPointerOperand(), TLI)) {
              hash_code SubKey = hash_value(RLI->getPointerOperand());
              return SubKey;
            }
          }
          if (LIt->second.size() > 2) {
            hash_code SubKey =
                hash_value(LIt->second.back()->getPointerOperand());
            return SubKey;
          }
        }
      }
      LoadsMap.try_emplace(std::make_pair(Key, Ptr))
          .first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // Add reduction values. The values are sorted for better vectorization
      // results.
      for (Value *V : PossibleRedVals) {
        size_t Key, Idx;
        std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                                               /*AllowAlternate=*/false);
        ++PossibleReducedVals[Key][Idx]
              .insert(std::make_pair(V, 0))
              .first->second;
      }
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of value kinds to start the reduction
    // from the longest possible reduced values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
           It != E; ++It) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = It->second.takeVector();
        stable_sort(RedValsVect, llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      int NewIdx = -1;
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (NewIdx < 0 ||
            (!isGoodForReduction(Data) &&
             (!isa<LoadInst>(Data.front()) ||
              !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
              getUnderlyingObject(
                  cast<LoadInst>(Data.front())->getPointerOperand()) !=
                  getUnderlyingObject(
                      cast<LoadInst>(ReducedVals[NewIdx].front())
                          ->getPointerOperand())))) {
          NewIdx = ReducedVals.size();
          ReducedVals.emplace_back();
        }
        ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
      }
    }
    // Sort the reduced values by the number of same/alternate opcode and/or
    // pointer operand.
    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
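  // Note: grouping reduced values by (opcode hash, operand subkey) means a
  // reduction like add(a[0], b, a[1], c, a[2], a[3]) can be re-bucketed so
  // that the consecutive loads of `a` land in one ReducedVals group and can be
  // vectorized as a single wide load, while b and c fall into the scalar
  // remainder combined by the final op.rdx chain.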
19609 constexpr unsigned RegMaxNumber = 4;
19610 constexpr unsigned RedValsMaxNumber = 128;
19614 if (
unsigned NumReducedVals = std::accumulate(
19615 ReducedVals.
begin(), ReducedVals.
end(), 0,
19617 if (!isGoodForReduction(Vals))
19619 return Num + Vals.size();
19621 NumReducedVals < ReductionLimit &&
19625 for (ReductionOpsType &RdxOps : ReductionOps)
19626 for (
Value *RdxOp : RdxOps)
19627 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19638 ReducedVals.
front().size());
19642 auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
19643 assert(isa<SelectInst>(RdxRootInst) &&
19644 "Expected min/max reduction to have select root instruction");
19645 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19646 assert(isa<Instruction>(ScalarCond) &&
19647 "Expected min/max reduction to have compare condition");
19648 return cast<Instruction>(ScalarCond);
19652 auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
19653 if (VectorizedTree) {
19656 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19657 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
19660 auto It = ReducedValsToOps.
find(Res);
19661 if (It != ReducedValsToOps.
end() &&
19667 return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
19673 bool AnyBoolLogicOp =
any_of(ReductionOps.back(), [](
Value *V) {
19674 return isBoolLogicOp(cast<Instruction>(V));
19677 ReductionOps.front().size());
19678 for (ReductionOpsType &RdxOps : ReductionOps)
19679 for (
Value *RdxOp : RdxOps) {
19682 IgnoreList.insert(RdxOp);
19687 for (
Value *U : IgnoreList)
19688 if (
auto *FPMO = dyn_cast<FPMathOperator>(U))
19689 RdxFMF &= FPMO->getFastMathFlags();
19690 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19695 for (
Value *V : Candidates)
19696 TrackedVals.try_emplace(V, V);
19699 Value *
V) ->
unsigned & {
19700 auto *It = MV.
find(V);
19701 assert(It != MV.
end() &&
"Unable to find given key.");
19710 bool CheckForReusedReductionOps =
false;
19715 for (
unsigned I = 0,
E = ReducedVals.
size();
I <
E; ++
I) {
19717 InstructionsState S = States[
I];
19721 for (
unsigned Cnt = 0, Sz = OrigReducedVals.
size(); Cnt < Sz; ++Cnt) {
19722 Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
19727 auto *Inst = dyn_cast<Instruction>(RdxVal);
19729 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
19730 (S.getOpcode() && !Inst))
19733 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19735 bool ShuffledExtracts =
false;
19737 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
19740 for (
Value *RV : ReducedVals[
I + 1]) {
19741 Value *RdxVal = TrackedVals.at(RV);
19745 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19748 CommonCandidates.push_back(RdxVal);
19749 TrackedToOrig.try_emplace(RdxVal, RV);
19754 Candidates.
swap(CommonCandidates);
19755 ShuffledExtracts =
true;
19762 Value *OrigV = TrackedToOrig.at(Candidates.
front());
19763 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19765 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
19766 Value *OrigV = TrackedToOrig.at(VC);
19767 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
19768 if (
auto *ResI = dyn_cast<Instruction>(Res))
19769 V.analyzedReductionRoot(ResI);
19771 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
19775 unsigned NumReducedVals = Candidates.
size();
19776 if (NumReducedVals < ReductionLimit &&
19777 (NumReducedVals < 2 || !
isSplat(Candidates)))
19782 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
19783 RdxKind != RecurKind::FMul &&
19784 RdxKind != RecurKind::FMulAdd;
19787 if (IsSupportedHorRdxIdentityOp)
19788 for (
Value *V : Candidates) {
19789 Value *OrigV = TrackedToOrig.at(V);
19790 ++SameValuesCounter.
try_emplace(OrigV).first->second;
19802 bool SameScaleFactor =
false;
19803 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
19804 SameValuesCounter.
size() != Candidates.size();
19806 if (OptReusedScalars) {
19808 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
19809 RdxKind == RecurKind::Xor) &&
19811 [&SameValuesCounter](
const std::pair<Value *, unsigned> &
P) {
19812 return P.second == SameValuesCounter.
front().second;
19814 Candidates.resize(SameValuesCounter.
size());
19815 transform(SameValuesCounter, Candidates.begin(),
19816 [&](
const auto &
P) { return TrackedVals.at(P.first); });
19817 NumReducedVals = Candidates.size();
19819 if (NumReducedVals == 1) {
19820 Value *OrigV = TrackedToOrig.at(Candidates.front());
19821 unsigned Cnt = At(SameValuesCounter, OrigV);
19823 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
19824 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
19825 VectorizedVals.try_emplace(OrigV, Cnt);
19826 ExternallyUsedValues.
insert(OrigV);
19831 unsigned MaxVecRegSize =
V.getMaxVecRegSize();
19832 unsigned EltSize =
V.getVectorElementSize(Candidates[0]);
19833 const unsigned MaxElts = std::clamp<unsigned>(
19835 RegMaxNumber * RedValsMaxNumber);
19837 unsigned ReduxWidth = NumReducedVals;
19838 auto GetVectorFactor = [&, &
TTI = *
TTI](
unsigned ReduxWidth) {
19839 unsigned NumParts, NumRegs;
19840 Type *ScalarTy = Candidates.front()->getType();
19847 while (NumParts > NumRegs) {
19848 ReduxWidth =
bit_floor(ReduxWidth - 1);
19854 if (NumParts > NumRegs / 2)
19859 ReduxWidth = GetVectorFactor(ReduxWidth);
19860 ReduxWidth = std::min(ReduxWidth, MaxElts);
19862 unsigned Start = 0;
19863 unsigned Pos = Start;
19865 unsigned PrevReduxWidth = ReduxWidth;
19866 bool CheckForReusedReductionOpsLocal =
false;
19867 auto AdjustReducedVals = [&](
bool IgnoreVL =
false) {
19868 bool IsAnyRedOpGathered = !IgnoreVL &&
V.isAnyGathered(IgnoreList);
19869 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
19872 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
19875 if (Pos < NumReducedVals - ReduxWidth + 1)
19876 return IsAnyRedOpGathered;
19879 if (ReduxWidth > 1)
19880 ReduxWidth = GetVectorFactor(ReduxWidth);
19881 return IsAnyRedOpGathered;
19883 bool AnyVectorized =
false;
19885 while (Pos < NumReducedVals - ReduxWidth + 1 &&
19886 ReduxWidth >= ReductionLimit) {
19889 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
19891 CheckForReusedReductionOps =
true;
19894 PrevReduxWidth = ReduxWidth;
19897 if (IgnoredCandidates.
contains(std::make_pair(Pos, ReduxWidth)) ||
19900 std::make_pair(Pos,
bit_floor(ReduxWidth))) ||
19902 std::make_pair(Pos + (ReduxWidth -
bit_floor(ReduxWidth)),
19904 V.areAnalyzedReductionVals(VL)) {
19905 (void)AdjustReducedVals(
true);
19911 auto *RedValI = dyn_cast<Instruction>(RedVal);
19914 return V.isDeleted(RedValI);
19917 V.buildTree(VL, IgnoreList);
19918 if (
V.isTreeTinyAndNotFullyVectorizable(
true)) {
19919 if (!AdjustReducedVals())
19920 V.analyzedReductionVals(VL);
19923 if (
V.isLoadCombineReductionCandidate(RdxKind)) {
19924 if (!AdjustReducedVals())
19925 V.analyzedReductionVals(VL);
19928 V.reorderTopToBottom();
19930 V.reorderBottomToTop(
true);
      // Keep extracted other reduction values, if they are used in the
      // vectorization trees.
      SmallPtrSet<Value *, 4> LocalExternallyUsedValues(ExternallyUsedValues);
      // The reduction root is used as the insertion point for new
      // instructions, so it is externally used.
      LocalExternallyUsedValues.insert(ReductionRoot);
      for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
        if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
          continue;
        for (Value *V : ReducedVals[Cnt])
          if (isa<Instruction>(V))
            LocalExternallyUsedValues.insert(TrackedVals[V]);
      }
      if (!IsSupportedHorRdxIdentityOp) {
        // Count the number of uses of each candidate outside this bundle.
        assert(SameValuesCounter.empty() &&
               "Reused values counter map is not empty");
        for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
          if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
            continue;
          Value *V = Candidates[Cnt];
          Value *OrigV = TrackedToOrig.at(V);
          ++SameValuesCounter.try_emplace(OrigV).first->second;
        }
      }
      V.transformNodes();
      SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
      // Gather externally used values.
      SmallPtrSet<Value *, 4> Visited;
      for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
        if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
          continue;
        Value *RdxVal = Candidates[Cnt];
        if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
          RdxVal = It->second;
        if (!Visited.insert(RdxVal).second)
          continue;
        // Check if the scalar was vectorized as part of the vectorization
        // tree but not the top node.
        if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
          LocalExternallyUsedValues.insert(RdxVal);
          continue;
        }
        Value *OrigV = TrackedToOrig.at(RdxVal);
        unsigned NumOps =
            VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
        if (NumOps != ReducedValsToOps.at(OrigV).size())
          LocalExternallyUsedValues.insert(RdxVal);
      }
      // The list of reused scalars is not needed in regular mode anymore.
      if (!IsSupportedHorRdxIdentityOp)
        SameValuesCounter.clear();
      for (Value *RdxVal : VL)
        if (RequiredExtract.contains(RdxVal))
          LocalExternallyUsedValues.insert(RdxVal);
      V.buildExternalUses(LocalExternallyUsedValues);

      V.computeMinimumValueSizes();
      // Estimate the cost of the whole attempt: tree cost plus the cost of
      // the reduction itself.
      InstructionCost TreeCost = V.getTreeCost(VL);
      InstructionCost ReductionCost =
          getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
      InstructionCost Cost = TreeCost + ReductionCost;
      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for reduction\n");
      if (!Cost.isValid())
        break;
      if (Cost >= -SLPCostThreshold) {
        V.getORE()->emit([&]() {
          return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
                                          ReducedValsToOps.at(VL[0]).front())
                 << "Vectorizing horizontal reduction is possible "
                 << "but not beneficial with cost " << ore::NV("Cost", Cost)
                 << " and threshold "
                 << ore::NV("Threshold", -SLPCostThreshold);
        });
        if (!AdjustReducedVals()) {
          V.analyzedReductionVals(VL);
          unsigned Offset = Pos == Start ? Pos : Pos - 1;
          if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
            // Add subvectors of VL to the list of the analyzed values.
            for (unsigned VF = getFloorFullVectorNumberOfElements(
                     *TTI, VL.front()->getType(), ReduxWidth - 1);
                 VF >= ReductionLimit;
                 VF = getFloorFullVectorNumberOfElements(
                     *TTI, VL.front()->getType(), VF - 1)) {
              if (has_single_bit(VF) &&
                  V.getCanonicalGraphSize() != V.getTreeSize())
                continue;
              for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
                IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
            }
          }
        }
        continue;
      }

      LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
                        << Cost << ". (HorRdx)\n");
      V.getORE()->emit([&]() {
        return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
                                  ReducedValsToOps.at(VL[0]).front())
               << "Vectorized horizontal reduction with cost "
               << ore::NV("Cost", Cost) << " and with tree size "
               << ore::NV("TreeSize", V.getTreeSize());
      });
      // Emit a reduction. If the root is a select (min/max idiom), the insert
      // point is the compare condition of that select.
      Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
      Instruction *InsertPt = RdxRootInst;
      if (IsCmpSelMinMax)
        InsertPt = GetCmpForMinMaxReduction(RdxRootInst);

      // Vectorize the tree.
      Value *VectorizedRoot =
          V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
      // Update the TrackedToOrig mapping, since the tracked values might have
      // been updated.
      for (Value *RdxVal : Candidates) {
        Value *OrigVal = TrackedToOrig.at(RdxVal);
        Value *TransformedRdxVal = TrackedVals.at(OrigVal);
        if (TransformedRdxVal != RdxVal)
          TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
      }

      Builder.SetInsertPoint(InsertPt);

      // To prevent poison from leaking across what used to be sequential,
      // safe, scalar boolean logic operations, the reduction operand must be
      // frozen.
      if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot))
        VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

      // Emit code to correctly handle reused reduced values, if required.
      if (OptReusedScalars && !SameScaleFactor) {
        VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
                                       SameValuesCounter, TrackedToOrig);
      }
      Value *ReducedSubTree;
      Type *ScalarTy = VL.front()->getType();
      if (isa<FixedVectorType>(ScalarTy)) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        unsigned ScalarTyNumElements = getNumElements(ScalarTy);
        // ... (initialize ReducedSubTree as a poison vector of
        // ScalarTyNumElements lanes)
        for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
          // Do the reduction for each lane.
          // ... (Value *Lane = strided shuffle extracting lane I of
          // VectorizedRoot)
          ReducedSubTree = Builder.CreateInsertElement(
              ReducedSubTree,
              emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
        }
      } else {
        ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
                                       RdxRootInst->getType());
      }
      if (ReducedSubTree->getType() != VL.front()->getType()) {
        assert(ReducedSubTree->getType() != VL.front()->getType() &&
               "Expected different reduction type.");
        ReducedSubTree =
            Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
                                  V.isSignedMinBitwidthRootNode());
      }

      // Improved analysis for add/fadd/xor reductions with the same scale
      // factor for all operands: a single scalar scale can be emitted instead.
      if (OptReusedScalars && SameScaleFactor)
        ReducedSubTree = emitScaleForReusedOps(
            ReducedSubTree, Builder, SameValuesCounter.front().second);
      VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
      // Count vectorized reduced values to exclude them from the final
      // reduction.
      for (Value *RdxVal : VL) {
        Value *OrigV = TrackedToOrig.at(RdxVal);
        if (IsSupportedHorRdxIdentityOp) {
          VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
          continue;
        }
        ++VectorizedVals.try_emplace(OrigV).first->getSecond();
        if (!V.isVectorized(RdxVal))
          RequiredExtract.insert(RdxVal);
      }
      Pos += ReduxWidth;
      Start = Pos;
      ReduxWidth = NumReducedVals - Pos;
      if (ReduxWidth > 1)
        ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
      AnyVectorized = true;
    }
    if (OptReusedScalars && !AnyVectorized) {
      for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
        Value *RdxVal = TrackedVals.at(P.first);
        Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
        VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
        VectorizedVals.try_emplace(P.first, P.second);
      }
    }
  }
  if (VectorizedTree) {
    // Reorder the operands of a bool logical op into the natural order to
    // avoid poison propagation problems. If reordering is not possible (both
    // operands were originally RHS operands), emit an extra freeze for the
    // LHS operand.
    // ...
    auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
                                                 Instruction *RedOp1,
                                                 Instruction *RedOp2,
                                                 bool InitStep) {
      if (!AnyBoolLogicOp)
        return;
      if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
                                    getRdxOperand(RedOp1, 0) == LHS ||
                                    isGuaranteedNotToBePoison(LHS)))
        return;
      if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
                                    getRdxOperand(RedOp2, 0) == RHS ||
                                    isGuaranteedNotToBePoison(RHS))) {
        std::swap(LHS, RHS);
        return;
      }
      if (LHS != VectorizedTree)
        LHS = Builder.CreateFreeze(LHS);
    };
    // Finish the reduction: fold the not-vectorized remainders and extra
    // arguments pairwise, to reduce dependencies between the combining ops.
    auto FinalGen =
        [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
            bool InitStep) {
          unsigned Sz = InstVals.size();
          SmallVector<std::pair<Instruction *, Value *>> ExtraReds(
              Sz / 2 + Sz % 2);
          for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
            Instruction *RedOp = InstVals[I + 1].first;
            Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
            Value *RdxVal1 = InstVals[I].second;
            Value *StableRdxVal1 = RdxVal1;
            auto It1 = TrackedVals.find(RdxVal1);
            if (It1 != TrackedVals.end())
              StableRdxVal1 = It1->second;
            Value *RdxVal2 = InstVals[I + 1].second;
            Value *StableRdxVal2 = RdxVal2;
            auto It2 = TrackedVals.find(RdxVal2);
            if (It2 != TrackedVals.end())
              StableRdxVal2 = It2->second;
            // To prevent poison from leaking across what used to be
            // sequential, safe, scalar boolean logic operations, the
            // reduction operand must be frozen.
            FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
                              RedOp, InitStep);
            Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
                                       StableRdxVal2, "op.rdx", ReductionOps);
            ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
          }
          if (Sz % 2 == 1)
            ExtraReds[Sz / 2] = InstVals.back();
          return ExtraReds;
        };
    SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
    ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
                                 VectorizedTree);
    SmallPtrSet<Value *, 8> Visited;
    for (ArrayRef<Value *> Candidates : ReducedVals) {
      for (Value *RdxVal : Candidates) {
        if (!Visited.insert(RdxVal).second)
          continue;
        unsigned NumOps = VectorizedVals.lookup(RdxVal);
        for (Instruction *RedOp :
             ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
          ExtraReductions.emplace_back(RedOp, RdxVal);
      }
    }
    // Iterate through all not-vectorized reduction values/extra arguments,
    // combining them pairwise per round.
    bool InitStep = true;
    while (ExtraReductions.size() > 1) {
      SmallVector<std::pair<Instruction *, Value *>> NewReds =
          FinalGen(ExtraReductions, InitStep);
      ExtraReductions.swap(NewReds);
      InitStep = false;
    }
    VectorizedTree = ExtraReductions.front().second;
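
// FinalGen above halves the list of leftover (instruction, value) pairs on
// every round, which keeps the remainder-combining tree shallow. Below is a
// standalone sketch of that pairing scheme on plain integer adds (not LLVM
// code; the real loop additionally threads debug locations and the
// bool-logic fixups through each combine).

#include <cstdio>
#include <vector>

static int reduceLeftovers(std::vector<int> Vals) {
  while (Vals.size() > 1) {
    std::vector<int> Next((Vals.size() + 1) / 2);
    for (size_t I = 0, E = (Vals.size() / 2) * 2; I < E; I += 2)
      Next[I / 2] = Vals[I] + Vals[I + 1]; // the "op.rdx" combine
    if (Vals.size() % 2 == 1)
      Next.back() = Vals.back(); // odd leftover passes through
    Vals.swap(Next);
  }
  return Vals.front();
}

int main() {
  std::printf("%d\n", reduceLeftovers({1, 2, 3, 4, 5})); // prints 15
}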
    ReductionRoot->replaceAllUsesWith(VectorizedTree);

    // The original scalar reduction is expected to have no remaining uses
    // outside the reduction tree itself. Assert that we got this correct,
    // replace internal uses with poison, and mark for eventual deletion.
#ifndef NDEBUG
    SmallSet<Value *, 4> IgnoreSet;
    for (ArrayRef<Value *> RdxOps : ReductionOps)
      IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
#endif
    for (ArrayRef<Value *> RdxOps : ReductionOps) {
      for (Value *Ignore : RdxOps) {
        if (!Ignore)
          continue;
#ifndef NDEBUG
        for (auto *U : Ignore->users()) {
          assert(IgnoreSet.count(U) &&
                 "All users must be either in the reduction ops list.");
        }
#endif
        if (!Ignore->use_empty()) {
          Value *P = PoisonValue::get(Ignore->getType());
          Ignore->replaceAllUsesWith(P);
        }
      }
      V.removeInstructionsAndOperands(RdxOps);
    }
  } else if (!CheckForReusedReductionOps) {
    // Mark the analyzed reduction roots to avoid repeated attempts.
    for (ReductionOpsType &RdxOps : ReductionOps)
      for (Value *RdxOp : RdxOps)
        V.analyzedReductionRoot(cast<Instruction>(RdxOp));
  }
  return VectorizedTree;
}
/// Calculate the cost of a reduction.
InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                 ArrayRef<Value *> ReducedVals,
                                 bool IsCmpSelMinMax, FastMathFlags FMF,
                                 const BoUpSLP &R) {
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  Type *ScalarTy = ReducedVals.front()->getType();
  unsigned ReduxWidth = ReducedVals.size();
  FixedVectorType *VectorTy = getWidenedType(ScalarTy, ReduxWidth);
  InstructionCost VectorCost = 0, ScalarCost;
  // If all of the reduced values are constant, the vector cost is 0, since
  // the reduction value can be calculated at compile time.
  bool AllConsts = allConstant(ReducedVals);
  auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
    InstructionCost Cost = 0;
    // The scalar cost is repeated for N-1 elements.
    int Cnt = ReducedVals.size();
    for (Value *RdxVal : ReducedVals) {
      if (Cnt == 1)
        break;
      --Cnt;
      if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
        Cost += GenCostFn();
        continue;
      }
      InstructionCost ScalarCost = 0;
      for (User *U : RdxVal->users()) {
        auto *RdxOp = cast<Instruction>(U);
        if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
          ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
          continue;
        }
        ScalarCost = InstructionCost::getInvalid();
        break;
      }
      if (ScalarCost.isValid())
        Cost += ScalarCost;
      else
        Cost += GenCostFn();
    }
    return Cost;
  };
  switch (RdxKind) {
  case RecurKind::Add:
  case RecurKind::Mul:
  case RecurKind::Or:
  case RecurKind::And:
  case RecurKind::Xor:
  case RecurKind::FAdd:
  case RecurKind::FMul: {
    unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
    if (!AllConsts) {
      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        for (unsigned I : seq<unsigned>(ReducedVals.size())) {
          // ... (cost a strided shuffle plus a per-lane arithmetic
          // reduction of VecTy)
        }
      } else {
        Type *RedTy = VectorTy->getElementType();
        auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
            std::make_pair(RedTy, true));
        if (RType == RedTy) {
          VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
                                                       FMF, CostKind);
        } else {
          VectorCost = TTI->getExtendedReductionCost(
              RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
              FMF, CostKind);
        }
      }
    }
    ScalarCost = EvaluateScalarCost([&]() {
      return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
    });
    break;
  }
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin: {
    Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
    if (!AllConsts)
      VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
    ScalarCost = EvaluateScalarCost([&]() {
      IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
      return TTI->getIntrinsicInstrCost(ICA, CostKind);
    });
    break;
  }
  default:
    llvm_unreachable("Expected arithmetic or min/max reduction operation");
  }

  LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                    << " for reduction of " << shortBundleName(ReducedVals)
                    << " (It is a splitting reduction)\n");
  return VectorCost - ScalarCost;
}
/// Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                     const TargetTransformInfo *TTI, Type *DestTy) {
  assert(VectorizedValue && "Need to have a vectorized tree node");
  assert(RdxKind != RecurKind::FMulAdd &&
         "A call to the llvm.fmuladd intrinsic is not handled yet");

  auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
  if (FTy->getScalarType() == Builder.getInt1Ty() &&
      RdxKind == RecurKind::Add &&
      DestTy->getScalarType() != FTy->getScalarType()) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
    Value *V = Builder.CreateBitCast(
        VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
    ++NumVectorInstructions;
    return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
  }
  ++NumVectorInstructions;
  return createSimpleReduction(Builder, VectorizedValue, RdxKind);
}
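
// The i1 special case above rewrites an add reduction of an <n x i1> mask as
// a population count of the bitcast integer. The same identity on a plain
// 64-bit mask, as a standalone sketch (not LLVM code):

#include <bit>
#include <cstdint>
#include <cstdio>

static unsigned reduceAddI1(std::uint64_t Mask, unsigned NumElements) {
  if (NumElements < 64)
    Mask &= (std::uint64_t{1} << NumElements) - 1; // keep only live lanes
  return static_cast<unsigned>(std::popcount(Mask));
}

int main() {
  // Lanes 0, 2 and 5 are true, so the reduction result is 3.
  std::printf("%u\n", reduceAddI1(0b100101, 8));
}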
/// Emits optimized code for the unique scalar value reused \p Cnt times.
Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                             unsigned Cnt) {
  assert(IsSupportedHorRdxIdentityOp &&
         "The optimization of matched scalar identity horizontal reductions "
         "must be supported.");
  if (Cnt == 1)
    return VectorizedValue;
  switch (RdxKind) {
  case RecurKind::Add: {
    // res = mul vv, n
    Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
    LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                      << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateMul(VectorizedValue, Scale);
  }
  case RecurKind::Xor: {
    // res = (n % 2 == 0) ? 0 : vv
    LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                      << ". (HorRdx)\n");
    if (Cnt % 2 == 0)
      return Constant::getNullValue(VectorizedValue->getType());
    return VectorizedValue;
  }
  case RecurKind::FAdd: {
    // res = fmul vv, n
    Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
    LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                      << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateFMul(VectorizedValue, Scale);
  }
  case RecurKind::And:
  case RecurKind::Or:
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin:
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
    // res = vv (idempotent operations)
    return VectorizedValue;
  case RecurKind::Mul:
  case RecurKind::FMul:
  case RecurKind::FMulAdd:
  case RecurKind::IAnyOf:
  case RecurKind::FAnyOf:
  case RecurKind::IFindLastIV:
  case RecurKind::FFindLastIV:
  case RecurKind::None:
    llvm_unreachable("Unexpected reduction kind for repeated scalar.");
  }
  return nullptr;
}
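
// The identities used by emitScaleForReusedOps are easy to verify on plain
// integers; a standalone sketch for the add and xor cases:

#include <cassert>

// n copies of v under '+' fold to v * n; under '^' they fold to v when n is
// odd and to 0 when n is even (and/or/min/max are fully idempotent).
static int scaleAdd(int V, unsigned Cnt) { return V * static_cast<int>(Cnt); }
static int scaleXor(int V, unsigned Cnt) { return (Cnt % 2) ? V : 0; }

int main() {
  assert(scaleAdd(7, 4) == 7 + 7 + 7 + 7);
  assert(scaleXor(7, 4) == (7 ^ 7 ^ 7 ^ 7));
  assert(scaleXor(7, 3) == (7 ^ 7 ^ 7));
}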
/// Emits the actual operation for the scalar identity values, found during
/// horizontal reduction analysis.
Value *
emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
              const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
              const DenseMap<Value *, Value *> &TrackedToOrig) {
  assert(IsSupportedHorRdxIdentityOp &&
         "The optimization of matched scalar identity horizontal reductions "
         "must be supported.");
  ArrayRef<Value *> VL = R.getRootNodeScalars();
  auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
  if (VTy->getElementType() != VL.front()->getType()) {
    VectorizedValue = Builder.CreateIntCast(
        VectorizedValue,
        getWidenedType(VL.front()->getType(), VTy->getNumElements()),
        R.isSignedMinBitwidthRootNode());
  }
  switch (RdxKind) {
  case RecurKind::Add: {
    // root = mul prev_root, <1, 1, n, 1>
    SmallVector<Constant *> Vals;
    for (Value *V : VL) {
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
      Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
    }
    Value *Scale = ConstantVector::get(Vals);
    LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
                      << VectorizedValue << ". (HorRdx)\n");
    return Builder.CreateMul(VectorizedValue, Scale);
  }
  case RecurKind::And:
  case RecurKind::Or:
    // No need for multiple or/and(s).
    LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                      << ". (HorRdx)\n");
    return VectorizedValue;
  case RecurKind::SMax:
  case RecurKind::SMin:
  case RecurKind::UMax:
  case RecurKind::UMin:
  case RecurKind::FMax:
  case RecurKind::FMin:
  case RecurKind::FMaximum:
  case RecurKind::FMinimum:
    // No need for multiple min/max(s) of the same value.
    LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
                      << ". (HorRdx)\n");
    return VectorizedValue;
  case RecurKind::Xor: {
    // Replace values with an even number of repeats with 0, since
    // x xor x = 0: shuffle in zeros for those lanes.
    SmallVector<int> Mask(
        cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
        PoisonMaskElem);
    std::iota(Mask.begin(), Mask.end(), 0);
    bool NeedShuffle = false;
    for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
      Value *V = VL[I];
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
      if (Cnt % 2 == 0) {
        Mask[I] = VF;
        NeedShuffle = true;
      }
    }
    LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I : Mask) dbgs() << I << " ";
               dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
    if (NeedShuffle)
      VectorizedValue = Builder.CreateShuffleVector(
          VectorizedValue,
          ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
    return VectorizedValue;
  }
  case RecurKind::FAdd: {
    // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
    SmallVector<Constant *> Vals;
    for (Value *V : VL) {
      unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
      Vals.push_back(ConstantFP::get(V->getType(), Cnt));
    }
    Value *Scale = ConstantVector::get(Vals);
    return Builder.CreateFMul(VectorizedValue, Scale);
  }
  case RecurKind::Mul:
  case RecurKind::FMul:
  case RecurKind::FMulAdd:
  case RecurKind::IAnyOf:
  case RecurKind::FAnyOf:
  case RecurKind::IFindLastIV:
  case RecurKind::FFindLastIV:
  case RecurKind::None:
    llvm_unreachable("Unexpected reduction kind for reused scalars.");
  }
  return nullptr;
}
}; // end class HorizontalReduction
/// Gets the recurrence kind from the specified value.
static RecurKind getRdxKind(Value *V) {
  return HorizontalReduction::getRdxKind(V);
}
static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();

  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      // Only homogeneous structs are supported.
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0))
          return std::nullopt;
      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
    } else if (CurrentType->isSingleValueType()) {
      return AggregateSize;
    } else {
      return std::nullopt;
    }
  } while (true);
}
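
// getAggregateSize flattens nested homogeneous aggregates into a single
// element count by multiplying the arity of each level. A standalone sketch
// on a toy dimension list (struct-member homogeneity is assumed, as the real
// code checks):

#include <cstdio>
#include <optional>
#include <vector>

static std::optional<unsigned>
aggregateSize(const std::vector<unsigned> &Dims) {
  unsigned Size = 1;
  for (unsigned D : Dims) {
    if (D == 0)
      return std::nullopt; // nothing to vectorize
    Size *= D;
  }
  return Size;
}

int main() {
  // [2 x [3 x <4 x float>]] flattens to 24 scalar elements.
  if (auto S = aggregateSize({2, 3, 4}))
    std::printf("%u\n", *S);
}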
static void findBuildAggregate_rec(Instruction *LastInsertInst,
                                   TargetTransformInfo *TTI,
                                   SmallVectorImpl<Value *> &BuildVectorOpds,
                                   SmallVectorImpl<Value *> &InsertElts,
                                   unsigned OperandOffset, const BoUpSLP &R) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(1);
    std::optional<unsigned> OperandIndex =
        getElementIndex(LastInsertInst, OperandOffset);
    if (!OperandIndex || R.isDeleted(LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
      findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
                             BuildVectorOpds, InsertElts, *OperandIndex, R);
    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
           LastInsertInst->hasOneUse());
}
static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts,
                               const BoUpSLP &R) {
  assert((isa<InsertElementInst>(LastInsertInst) ||
          isa<InsertValueInst>(LastInsertInst)) &&
         "Expected insertelement or insertvalue instruction!");
  assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
         "Expected empty result vectors!");

  std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
  if (!AggregateSize)
    return false;
  BuildVectorOpds.resize(*AggregateSize);
  InsertElts.resize(*AggregateSize);

  findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts,
                         /*OperandOffset=*/0, R);
  llvm::erase(BuildVectorOpds, nullptr);
  llvm::erase(InsertElts, nullptr);
  if (BuildVectorOpds.size() >= 2)
    return true;

  return false;
}
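
// findBuildAggregate walks the insert chain newest-to-oldest and records one
// operand per slot. Below is a standalone sketch of that gathering step on a
// toy (index, operand) chain, where a newer insert shadows older writes to
// the same slot:

#include <cstdio>
#include <optional>
#include <vector>

struct Insert {
  unsigned Index;
  int Operand;
};

static std::vector<std::optional<int>>
gatherBuildVector(const std::vector<Insert> &ChainNewestFirst,
                  unsigned AggregateSize) {
  std::vector<std::optional<int>> Slots(AggregateSize);
  for (const Insert &I : ChainNewestFirst)
    if (I.Index < AggregateSize && !Slots[I.Index])
      Slots[I.Index] = I.Operand; // first (newest) write wins
  return Slots;
}

int main() {
  auto Slots = gatherBuildVector({{1, 20}, {0, 10}, {1, 99}}, 2);
  std::printf("%d %d\n", *Slots[0], *Slots[1]); // 10 20; the 99 is shadowed
}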
static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {
  // There are situations where the reduction value is not dominated by the
  // reduction phi. Vectorizing such cases has been reported to cause
  // miscompiles. See PR25787.
  auto DominatedReduxValue = [&](Value *R) {
    return isa<Instruction>(R) &&
           DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
  };

  Instruction *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBLoop = LI->getLoopFor(ParentBB);
  if (!BBLoop)
    return nullptr;
  BasicBlock *BBLatch = BBLoop->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch, return the incoming value if it comes from that.
  // This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}
/// We could have an initial reduction that is not an add.
///  r *= v1 + v2 + v3 + v4
/// In such a case start looking for a tree rooted in the first '+'.
/// \Returns the new root if found, which may be nullptr if not an
/// instruction.
static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
                                                 Instruction *Root) {
  assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
          isa<IntrinsicInst>(Root)) &&
         "Expected binop, select, or intrinsic for reduction matching");
  Value *LHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
  Value *RHS =
      Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
  if (LHS == Phi)
    return dyn_cast<Instruction>(RHS);
  if (RHS == Phi)
    return dyn_cast<Instruction>(LHS);
  return nullptr;
}

/// \Returns the first operand of \p I that does not match \p Phi. If the
/// operand is not an instruction it returns nullptr.
static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
  Value *Op0 = nullptr;
  Value *Op1 = nullptr;
  if (!matchRdxBop(I, Op0, Op1))
    return nullptr;
  return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
}

/// \Returns true if \p I is a candidate instruction for reduction
/// vectorization.
static bool isReductionCandidate(Instruction *I) {
  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
  Value *B0 = nullptr, *B1 = nullptr;
  bool IsBinop = matchRdxBop(I, B0, B1);
  return IsBinop || IsSelect;
}
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  if (!ShouldVectorizeHor)
    return false;
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);

  if (Root->getParent() != BB || isa<PHINode>(Root))
    return false;

  // If we can find a secondary reduction root, use that instead.
  auto SelectRoot = [&]() {
    if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
        HorizontalReduction::getRdxKind(Root) != RecurKind::None)
      if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
        return NewRoot;
    return Root;
  };

  // Start the analysis from the Root instruction. If a horizontal reduction
  // is found, try to vectorize it. If not (or vectorization is not
  // profitable), try the operands as new seeds in pre-order DFS traversal
  // order.
  // ...
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(SelectRoot(), 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  auto TryToReduce = [this, &R](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(Inst))
      return nullptr;
    if (!isReductionCandidate(Inst))
      return nullptr;
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
      return nullptr;
    return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
  };
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      FutureSeed = getNonPhiOperand(Root, P);
      if (!FutureSeed)
        return false;
    }
    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
    // analysis is done separately.
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
      PostponedInsts.push_back(FutureSeed);
    return true;
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(Inst, Level) = Stack.front();
    Stack.pop();
    // Do not try to analyze an instruction that has already been vectorized.
    // This may happen when we vectorize instruction operands on a previous
    // iteration while the stack was populated before that happened.
    if (R.isDeleted(Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
        // Try to find another reduction.
        Stack.emplace(I, Level);
        continue;
      }
      if (R.isDeleted(Inst))
        continue;
    } else {
      // We could not vectorize `Inst`, so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        assert(Stack.empty() && "Expected empty stack");
        break;
      }
    }

    // Try to vectorize the operands. Continue analysis only for instructions
    // from the same basic block, to save compile time.
    if (++Level < RecursionMaxDepth)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Op).second)
          if (auto *I = dyn_cast<Instruction>(Op))
            // Do not try to vectorize CmpInst operands; this is done
            // separately.
            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
                !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(I, Level);
  }
  return Res;
}
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // An aggregate value is unlikely to be processed in a vector register.
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
      (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(T *, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(*SameTypeIt, *IncIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success: start over because instructions might have been changed.
      Changed = true;
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL)
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
    } else {
      // If we have few candidates, add them to the list of candidates for
      // later (possibly smaller VF) attempts.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL)
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success: start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 &&
              TryToVectorizeHelper(ArrayRef(VL), /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
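
// tryToVectorizeSequence is a sort-then-group driver: one stable sort by the
// comparator, then each maximal run of mutually compatible elements is handed
// to the vectorization callback. Below is a standalone sketch of that driver
// shape (here "compatible" is simply same parity, and the callback prints
// the run):

#include <algorithm>
#include <cstdio>
#include <vector>

template <typename T, typename Cmp, typename Compat, typename Fn>
static void forEachRun(std::vector<T> Xs, Cmp Less, Compat Same, Fn Visit) {
  std::stable_sort(Xs.begin(), Xs.end(), Less);
  for (size_t I = 0; I < Xs.size();) {
    size_t J = I + 1;
    while (J < Xs.size() && Same(Xs[I], Xs[J]))
      ++J;
    if (J - I > 1) // runs of length 1 cannot form a bundle
      Visit(&Xs[I], J - I);
    I = J;
  }
}

int main() {
  forEachRun(
      std::vector<int>{5, 2, 8, 3, 4},
      [](int A, int B) { return A % 2 < B % 2; },
      [](int A, int B) { return A % 2 == B % 2; },
      [](const int *P, size_t N) {
        while (N--)
          std::printf("%d ", *P++);
        std::printf("| ");
      });
  std::printf("\n"); // prints: 2 8 4 | 5 3 |
}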
/// Compare two cmp instructions. If IsCompatibility is true, the function
/// returns true if the two cmps are compatible (possibly vectorizable
/// together); otherwise it returns a less/greater result for sorting.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(V);
  auto *CI2 = cast<CmpInst>(V2);
  if (CI1->getOperand(0)->getType()->getTypeID() <
      CI2->getOperand(0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getTypeID() >
      CI2->getOperand(0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(0)->getType()->getScalarSizeInBits())
    return false;
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Op1))
      if (auto *I2 = dyn_cast<Instruction>(Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with the same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode({I1, I2}, TLI);
        if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
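
// compareCmp<IsCompatibility> folds a strict-weak "less" and an equivalence
// test into one key-by-key walk by flipping what is returned when keys differ
// or when the walk finishes. A standalone sketch of the same trick:

#include <cassert>
#include <string>

template <bool IsCompatibility>
static bool compareKeyed(const std::string &A, const std::string &B) {
  if (A.size() < B.size())
    return !IsCompatibility; // "less" says true, "compatible" says false
  if (A.size() > B.size())
    return false;
  for (size_t I = 0; I < A.size(); ++I) {
    if (A[I] < B[I])
      return !IsCompatibility;
    if (A[I] > B[I])
      return false;
  }
  return IsCompatibility; // equal keys: not less, but compatible
}

int main() {
  assert(compareKeyed<false>("ab", "ac")); // sorts before
  assert(!compareKeyed<false>("ab", "ab"));
  assert(compareKeyed<true>("ab", "ab")); // compatible
  assert(!compareKeyed<true>("ab", "ac"));
}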
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
        if (R.isDeleted(I))
          break;
      }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize the list of compares. Sort by type, compare predicate,
  // etc. so that potentially compatible cmps end up adjacent.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, *TLI, *DT);
  };

  auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    return compareCmp<true>(V1, V2, *TLI, *DT);
  };

  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
      Vals.push_back(V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(U);
            return Select &&
                   Select->getParent() != cast<Instruction>(V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(Instructions)) {
    // Pass 1: try to match and vectorize a buildvector sequence for MaxVF
    // only.
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/true);
    }
    // Pass 2: try to vectorize reductions only.
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(I))
      continue;
    // Pass 3: try to match and vectorize a buildvector sequence with any VF.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
      OpsChanged |=
          vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
      OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize the postponed instructions.
  OpsChanged |= tryToVectorize(PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in their use trees, used for
  // better matching when sorting/grouping the phis.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers.
    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
      return true;
    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
      return false;
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S.getOpcode() && !S.isAltShuffle())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
        bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
        if (C1 && C2)
          continue;
        if (C1)
          return true;
        if (C2)
          return false;
      }
      bool U1 = isa<UndefValue>(Opcodes1[I]);
      bool U2 = isa<UndefValue>(Opcodes2[I]);
      {
        // Non-constant non-instructions come next.
        if (!U1 && !U2) {
          auto ValID1 = Opcodes1[I]->getValueID();
          auto ValID2 = Opcodes2[I]->getValueID();
          if (ValID1 == ValID2)
            continue;
          if (ValID1 < ValID2)
            return true;
          if (ValID1 > ValID2)
            return false;
        }
        if (!U1)
          return true;
        if (!U2)
          return false;
      }
      // Undefs come last.
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (R.isDeleted(I1) || R.isDeleted(I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S.getOpcode())
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(&I);
      if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
        break;

      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    // Find the corresponding non-phi nodes for better matching when trying
    // to build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(PHI);
        }))
      PHIToOpcodes.clear();
    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();

  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in PostProcessInserts and, if VectorizeCmps is true,
  // also vectorizes PostProcessCmps.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if I is in PostProcessInserts or PostProcessCmps.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if the instruction is not vectorizable itself: it has no
  // users and either returns void or is a call/invoke.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable type. The number of elements is
    // unknown at compile time for scalable types.
    if (isa<ScalableVectorType>(It->getType()))
      continue;

    // Skip instructions marked for deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times, so skip the ones already checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // Start over, since some instructions were deleted and the iterator
        // may have become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    if (isa<DbgInfoIntrinsic>(It))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here; delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res =
              vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
          Changed |= Res;
          if (Res && R.isDeleted(P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize the chain in the store, if this is the only store
        // to the address in the block.
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here; delay
          // their vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
        }
      }
      // Start vectorization of the post-process list of instructions from
      // the top-tree instructions, to try to vectorize as many instructions
      // as possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // Start over, since some instructions were deleted and the iterator
        // may have become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there is
    // nothing to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. We are trying to vectorize the index computations, so the
    // maximum number of elements is based on the size of the index
    // expression, rather than the size of the GEP itself.
    auto It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs, preserving program
      // order.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them, or their index may have been optimized to
      // a constant. If so, remove them from the set of candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase, since one can be
      // computed from the other.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1;
           ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      // We ensured the indices met these constraints when we originally
      // collected the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and value operands. Value operands must be
  // compatible (same opcode, same parent), otherwise it is definitely not
  // profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // UndefValues are compatible with all other values.
    if (isa<UndefValue>(V->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
        DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    if (isa<Constant>(V->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return false;
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        return S.getOpcode() > 0;
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse the stores to do a bottom-to-top analysis.
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
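
// StoreSorter above is a lexicographic comparison over cheap keys so that
// potentially compatible stores become adjacent before grouping. Below is a
// standalone sketch of the same idea using std::tie on a toy store
// descriptor (not LLVM code):

#include <algorithm>
#include <cstdio>
#include <tuple>
#include <vector>

struct StoreDesc {
  int ValueTypeID;
  int PointerTypeID;
  unsigned ScalarBits;
  const char *Name;
};

int main() {
  std::vector<StoreDesc> Stores = {
      {2, 1, 64, "d"}, {1, 1, 32, "a"}, {1, 1, 32, "b"}, {1, 2, 32, "c"}};
  std::stable_sort(Stores.begin(), Stores.end(),
                   [](const StoreDesc &A, const StoreDesc &B) {
                     return std::tie(A.ValueTypeID, A.PointerTypeID,
                                     A.ScalarBits) <
                            std::tie(B.ValueTypeID, B.PointerTypeID,
                                     B.ScalarBits);
                   });
  for (const StoreDesc &S : Stores)
    std::printf("%s ", S.Name); // prints: a b c d
  std::printf("\n");
}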
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
void clearAllBits()
Set every bit to 0.
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
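The swapped and inverse predicate helpers are easy to confuse; a minimal sketch of the distinction (function name is illustrative):

#include "llvm/IR/InstrTypes.h"
#include <cassert>

void predicateDemo() {
  // Swapping commutes the operands: a <s b becomes b >s a.
  auto Swapped = llvm::CmpInst::getSwappedPredicate(llvm::CmpInst::ICMP_SLT);
  assert(Swapped == llvm::CmpInst::ICMP_SGT);
  // Inverting negates the relation: a <s b becomes a >=s b.
  auto Inverse = llvm::CmpInst::getInversePredicate(llvm::CmpInst::ICMP_SLT);
  assert(Inverse == llvm::CmpInst::ICMP_SGE);
}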
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign information.
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate for the type of this constant.
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string, and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a multiple of 8.
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
static bool shouldExecute(unsigned CounterName)
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
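A minimal sketch of how these DenseMap operations differ, in particular lookup (default value when absent) versus at (aborts when absent) versus insert (keeps the existing entry) — names are illustrative:

#include "llvm/ADT/DenseMap.h"
#include <cassert>

void denseMapDemo() {
  llvm::DenseMap<int, unsigned> Uses;
  Uses.try_emplace(7, 1);               // constructs in place if absent
  auto [It, Inserted] = Uses.insert({7, 99});
  assert(!Inserted && It->second == 1); // existing entry wins
  assert(Uses.lookup(42) == 0);         // absent key: default-constructed value
  assert(Uses.count(7) == 1 && Uses.contains(7));
  Uses.erase(7);
}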
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
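A minimal sketch of the dominance query pattern these members support; F, Def, and U are assumed to come from an already-constructed module:

#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"

bool defDominatesUse(llvm::Function &F, llvm::Instruction &Def, llvm::Use &U) {
  llvm::DominatorTree DT(F); // computes the tree for F
  // True if the (end of the) block containing Def dominates the use U.
  return DT.dominates(Def.getParent(), U);
}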
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr)
ConstantInt * getFalse()
Get the constant value for i1 false.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
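A minimal sketch of the IRBuilder calls listed above, emitting a 4-lane splat at the end of an existing block BB (function name and the splat recipe are illustrative, not this pass's codegen):

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

llvm::Value *emitSplat(llvm::BasicBlock *BB, llvm::Value *Scalar) {
  llvm::IRBuilder<> Builder(BB->getContext());
  Builder.SetInsertPoint(BB); // append to the end of BB
  auto *VecTy = llvm::FixedVectorType::get(Scalar->getType(), 4);
  llvm::Value *V = Builder.CreateInsertElement(
      llvm::PoisonValue::get(VecTy), Scalar, Builder.getInt32(0));
  // A zero mask broadcasts lane 0 into every lane.
  return Builder.CreateShuffleVector(V, llvm::PoisonValue::get(VecTy),
                                     llvm::SmallVector<int>(4, 0));
}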
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field or array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos lives in, right after MovePos.
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instruction comes before Other.
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this', or nullptr if no such instruction exists.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
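Unlike DenseMap, MapVector preserves insertion order; a minimal sketch (names are illustrative):

#include "llvm/ADT/MapVector.h"
#include <cassert>

void mapVectorDemo() {
  llvm::MapVector<int, const char *> MV;
  MV.insert({2, "second"});
  MV.try_emplace(1, "first");
  // Iteration is deterministic, in insertion order: key 2, then key 1.
  assert(MV.front().first == 2);
  auto Pairs = MV.takeVector(); // clears MV, hands back the underlying vector
  assert(MV.empty() && Pairs.size() == 2);
}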
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory referenced by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
A discriminated union of two or more pointer types, with the discriminator in the low bit of the pointer.
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience functions.
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its value.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
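A minimal sketch of SetVector's set-plus-deterministic-order semantics (names are illustrative):

#include "llvm/ADT/SetVector.h"
#include <cassert>

void setVectorDemo() {
  llvm::SetVector<int> SV;
  assert(SV.insert(3));  // newly inserted
  assert(!SV.insert(3)); // duplicate: rejected, order untouched
  SV.insert(1);
  assert(SV.front() == 3 && SV.size() == 2 && SV.contains(1));
  SV.clear();
  assert(SV.empty());
}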
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exactly one source vector.
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e. each index between [0..VF) is used exactly once in each submask of size VF.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
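These static mask classifiers operate on plain index arrays, with no IR required; a minimal sketch (mask values are illustrative):

#include "llvm/IR/Instructions.h"
#include <cassert>

void maskClassifierDemo() {
  // <4, 5, 6, 7> extracts the upper half of an 8-element source.
  int Extract[] = {4, 5, 6, 7};
  int Index = -1;
  assert(llvm::ShuffleVectorInst::isExtractSubvectorMask(
             Extract, /*NumSrcElts=*/8, Index) &&
         Index == 4);

  // <3, 2, 1, 0> reverses a 4-element source.
  int Rev[] = {3, 2, 1, 0};
  assert(llvm::ShuffleVectorInst::isReverseMask(Rev, /*NumSrcElts=*/4));
}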
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
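A minimal sketch of the bit-scanning interface above (names are illustrative):

#include "llvm/ADT/SmallBitVector.h"
#include <cassert>

void bitVectorDemo() {
  llvm::SmallBitVector Bits(8); // 8 bits, initially clear
  assert(Bits.none() && !Bits.any());
  Bits.set(1);
  Bits.set(4);
  assert(Bits.count() == 2 && Bits.test(4) && !Bits.all());
  assert(Bits.find_first() == 1 && Bits.find_next(1) == 4);
  assert(Bits.find_next(4) == -1); // no further set bits
}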
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instances.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+ etc).
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
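A minimal sketch of the SmallVector growth interface listed above (names are illustrative):

#include "llvm/ADT/SmallVector.h"
#include <cassert>
#include <iterator>

void smallVectorDemo() {
  llvm::SmallVector<int, 8> V; // up to 8 elements without heap allocation
  V.reserve(4);
  V.push_back(10);
  V.emplace_back(20);
  int More[] = {30, 40};
  V.append(std::begin(More), std::end(More));
  assert(V.size() == 4 && *V.rbegin() == 40); // rbegin visits 40 first
}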
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
void takeName(Value *V)
Transfer the name from V to this value.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
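A minimal sketch of constructing fixed and scalable vector types with these factories (function name is illustrative):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

void vectorTypeDemo(llvm::LLVMContext &Ctx) {
  llvm::Type *F32 = llvm::Type::getFloatTy(Ctx);
  auto *V4 = llvm::FixedVectorType::get(F32, 4); // <4 x float>
  auto *VS = llvm::VectorType::get(
      F32, llvm::ElementCount::getScalable(4));  // <vscale x 4 x float>
  assert(V4->getNumElements() == 4 && VS->getElementCount().isScalable());
}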
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
\Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, Instruction *VL0, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state in which we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order, starting from the root node down to the leaf nodes.
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with a narrower bitwidth at codegen and, if so, returns its signedness.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars to be extracted, their lanes and their scalar users.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions, marking for deletion any trivially dead operands.
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and extraction in the UserIgnoreLst.
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty order.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score, deemed to have the best chance to form the best SLP tree.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibly) a permutation with other gathers.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
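A minimal sketch of composing these matchers; note that m_Add is not commutativity-aware (m_c_Add is), so both operand orders are tried explicitly here (function name is illustrative):

#include "llvm/IR/PatternMatch.h"

// Does V compute X + <something>, in either operand order?
bool isAddOf(llvm::Value *V, llvm::Value *X) {
  using namespace llvm::PatternMatch;
  llvm::Value *Other = nullptr;
  return match(V, m_Add(m_Specific(X), m_Value(Other))) ||
         match(V, m_Add(m_Value(Other), m_Specific(X)));
}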
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to calculate the distance between them.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing the effect of MI in a DIExpression.
testing::Matcher< const detail::ErrorHolder & > Failed()
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intrinsic.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal.address from the specified value V, returning the original object being addressed.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
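These power-of-two helpers recur throughout vectorization cost logic; a minimal sketch of their relationships (header placement of bit_ceil/bit_floor/has_single_bit in llvm/ADT/bit.h is an assumption of a recent LLVM):

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

void bitMathDemo() {
  assert(llvm::bit_floor(5u) == 4u);  // largest power of 2 <= 5
  assert(llvm::bit_ceil(5u) == 8u);   // smallest power of 2 >= 5
  assert(llvm::PowerOf2Ceil(5) == 8); // 64-bit counterpart of bit_ceil
  assert(llvm::has_single_bit(8u));   // exactly one bit set
  assert(llvm::Log2_32(32) == 5u);
  assert(llvm::divideCeil(7, 2) == 4);
}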
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
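A minimal sketch of the masks these two builders produce; the lane values in the comments follow the documented semantics above (function name is illustrative):

#include "llvm/Analysis/VectorUtils.h"

void maskBuilderDemo() {
  // Every Stride-th lane starting at Start: <0, 2, 4, 6>.
  llvm::SmallVector<int, 16> Strided =
      llvm::createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  // Each of VF lanes repeated ReplicationFactor times: <0, 0, 1, 1, 2, 2>.
  llvm::SmallVector<int, 16> Replicated =
      llvm::createReplicatedMask(/*ReplicationFactor=*/2, /*VF=*/3);
  (void)Strided;
  (void)Replicated;
}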
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that will be converted into a vector (I).
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly.
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
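A minimal sketch of the range-based helpers that dominate this file's style; the structured binding over enumerate assumes a recent LLVM where its results are tuple-like:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

void rangeDemo() {
  llvm::SmallVector<int, 4> Vals = {2, 4, 6};
  assert(llvm::all_of(Vals, [](int V) { return V % 2 == 0; }));
  assert(llvm::none_of(Vals, [](int V) { return V < 0; }));
  assert(llvm::is_contained(Vals, 4));
  // enumerate pairs each element with its 0-based index.
  for (auto [Idx, V] : llvm::enumerate(Vals))
    assert(V == Vals[Idx]);
  // seq iterates [0, 3): 0, 1, 2.
  unsigned Sum = 0;
  for (unsigned I : llvm::seq<unsigned>(0, 3))
    Sum += I;
  assert(Sum == 3);
}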
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than the source type.
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use graph.
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits methods.
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.