74#ifdef EXPENSIVE_CHECKS
109using namespace std::placeholders;
111#define SV_NAME "slp-vectorizer"
112#define DEBUG_TYPE "SLP"
114STATISTIC(NumVectorInstructions,
"Number of vector instructions generated");
115STATISTIC(NumStridedStoreChains,
"Number of vectorized stride stores");
116STATISTIC(NumStoreChains,
"Number of vector stores created");
117STATISTIC(NumVectorizedStores,
"Number of vectorized stores");
120 "Controls which SLP graphs should be vectorized.");
124 cl::desc(
"Run the SLP vectorization passes"));
128 cl::desc(
"Enable vectorization for wider vector utilization"));
132 cl::desc(
"Only vectorize if you gain more than this "
137 cl::desc(
"Attempt to vectorize horizontal reductions"));
142 "Attempt to vectorize horizontal reductions feeding into a store"));
146 cl::desc(
"Improve the code quality by splitting alternate instructions"));
150 cl::desc(
"Reject vectorization if vector instruction count exceeds "
151 "scalar instruction count"));
155 cl::desc(
"Attempt to vectorize for this register size in bits"));
159 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
167 cl::desc(
"Limit the size of the SLP scheduling region per block"));
171 cl::desc(
"Attempt to vectorize for this register size in bits"));
175 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
179 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
185 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
194 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
198 cl::desc(
"The minimum number of loads, which should be considered strided, "
199 "if the stride is > 1 or is runtime value"));
204 "The minimum number of stores, which should be considered strided, "
205 "if the stride is > 1 or is runtime value"));
209 cl::desc(
"The maximum stride, considered to be profitable."));
214 cl::desc(
"Enable SLP trees to be built from strided "
219 cl::desc(
"Disable tree reordering even if it is "
220 "profitable. Used for testing only."));
224 cl::desc(
"Generate strided loads even if they are not "
225 "profitable. Used for testing only."));
229 cl::desc(
"Display the SLP trees with Graphviz"));
233 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
237 cl::desc(
"Force vectorization of non-vectorizable stores operands."));
242 "Use non-vectorizable instructions as potential reduction roots."));
254 cl::desc(
"Try to replace values with the idempotent instructions for "
255 "better vectorization."));
259 cl::desc(
"Loop trip count, considered by the cost model during "
260 "modeling (0=loops are ignored and considered flat code)"));
271 cl::desc(
"Use per-lane execution scale for gather/buildvector tree "
272 "entries to model LICM-hoistable buildvector sequences."));
304 Ty = Ty->getScalarType();
306 !Ty->isPPC_FP128Ty();
317 return SI->getValueOperand()->getType();
320 return CI->getOperand(0)->getType();
323 return IE->getOperand(1)->getType();
330 "ScalableVectorType is not supported.");
332 return VecTy->getNumElements();
346 Type *Ty,
unsigned Sz) {
351 if (NumParts == 0 || NumParts >= Sz)
366 if (NumParts == 0 || NumParts >= Sz)
371 return (Sz / RegVF) * RegVF;
383 I * VecTyNumElements, VecTyNumElements)))
385 : Mask[
I] * VecTyNumElements + J;
419 unsigned SVNumElements =
421 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
422 if (SVNumElements % ShuffleMaskSize != 0)
424 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
425 if (GroupSize == 0 || (VL.
size() % GroupSize) != 0)
427 unsigned NumGroup = 0;
428 for (
size_t I = 0,
E = VL.
size();
I !=
E;
I += GroupSize) {
430 Value *Src = SV->getOperand(0);
436 if (SV->getOperand(0) != Src)
439 if (!SV->isExtractSubvectorMask(Index))
441 ExpectedIndex.
set(Index / ShuffleMaskSize);
445 if (!ExpectedIndex.
all())
449 assert(NumGroup == (VL.
size() / GroupSize) &&
"Unexpected number of groups");
468 unsigned SVNumElements =
471 unsigned AccumulateLength = 0;
472 for (
Value *V : VL) {
474 for (
int M : SV->getShuffleMask())
476 : AccumulateLength + M);
477 AccumulateLength += SVNumElements;
518 return std::min<unsigned>(PartNumElems,
Size - Part * PartNumElems);
527 OS <<
"Idx: " << Idx <<
", ";
528 OS <<
"n=" << VL.
size() <<
" [" << *VL.
front() <<
", ..]";
551 if (BB !=
II->getParent())
568 Value *FirstNonUndef =
nullptr;
569 for (
Value *V : VL) {
572 if (!FirstNonUndef) {
576 if (V != FirstNonUndef)
579 return FirstNonUndef !=
nullptr;
594 bool IsCopyable =
false) {
596 return Cmp->isCommutative();
598 return BO->isCommutative() ||
599 (BO->getOpcode() == Instruction::Sub &&
607 if (match(U.getUser(),
608 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
609 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
613 auto *I = dyn_cast<BinaryOperator>(U.get());
614 return match(U.getUser(),
615 m_Intrinsic<Intrinsic::abs>(
616 m_Specific(U.get()), m_ConstantInt(Flag))) &&
617 ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
620 (BO->getOpcode() == Instruction::FSub &&
624 return match(U.getUser(),
625 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
627 return I->isCommutative();
634 bool IsCopyable =
false) {
636 "The instruction is not commutative.");
640 switch (BO->getOpcode()) {
641 case Instruction::Sub:
642 case Instruction::FSub:
648 return I->isCommutableOperand(
Op);
668 constexpr unsigned IntrinsicNumOperands = 2;
669 return IntrinsicNumOperands;
671 return I->getNumOperands();
677 static_assert(std::is_same_v<T, InsertElementInst> ||
678 std::is_same_v<T, ExtractElementInst>,
688 if (CI->getValue().uge(VT->getNumElements()))
690 Index *= VT->getNumElements();
691 Index += CI->getZExtValue();
713 Type *CurrentType =
IV->getType();
714 for (
unsigned I :
IV->indices()) {
716 Index *= ST->getNumElements();
717 CurrentType = ST->getElementType(
I);
719 Index *= AT->getNumElements();
720 CurrentType = AT->getElementType();
742 return std::all_of(It, VL.
end(), [&](
Value *V) {
743 if (auto *CI = dyn_cast<CmpInst>(V))
744 return BasePred == CI->getPredicate();
745 if (auto *I = dyn_cast<Instruction>(V))
746 return I->getOpcode() == Opcode;
747 return isa<PoisonValue>(V);
775 if (MaskArg == UseMask::UndefsAsMask)
779 if (MaskArg == UseMask::FirstArg &&
Value < VF)
780 UseMask.reset(
Value);
781 else if (MaskArg == UseMask::SecondArg &&
Value >= VF)
782 UseMask.reset(
Value - VF);
790template <
bool IsPoisonOnly = false>
794 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
802 if (!UseMask.empty()) {
813 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
828 for (
unsigned I = 0,
E = VecTy->getNumElements();
I !=
E; ++
I) {
829 if (
Constant *Elem =
C->getAggregateElement(
I))
831 (UseMask.empty() || (
I < UseMask.size() && !UseMask.test(
I))))
859static std::optional<TargetTransformInfo::ShuffleKind>
872 return std::max(S, VTy->getNumElements());
875 Value *Vec1 =
nullptr;
876 Value *Vec2 =
nullptr;
881 Value *Vec = EE->getVectorOperand();
887 ShuffleMode CommonShuffleMode =
Unknown;
889 for (
unsigned I = 0,
E = VL.
size();
I <
E; ++
I) {
896 auto *Vec = EI->getVectorOperand();
910 if (Idx->getValue().uge(
Size))
912 unsigned IntIdx = Idx->getValue().getZExtValue();
919 if (!Vec1 || Vec1 == Vec) {
921 }
else if (!Vec2 || Vec2 == Vec) {
927 if (CommonShuffleMode == Permute)
931 if (Mask[
I] %
Size !=
I) {
932 CommonShuffleMode = Permute;
935 CommonShuffleMode =
Select;
938 if (CommonShuffleMode ==
Select && Vec2)
948 unsigned Opcode =
E->getOpcode();
949 assert((Opcode == Instruction::ExtractElement ||
950 Opcode == Instruction::ExtractValue) &&
951 "Expected extractelement or extractvalue instruction.");
952 if (Opcode == Instruction::ExtractElement) {
958 unsigned Idx = CI->getZExtValue();
966 if (EI->getNumIndices() != 1)
968 return *EI->idx_begin();
1002class BinOpSameOpcodeHelper {
1003 using MaskType = std::uint_fast32_t;
1005 constexpr static std::initializer_list<unsigned> SupportedOp = {
1006 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
1007 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
1009 "SupportedOp is not sorted.");
1027 static std::pair<ConstantInt *, unsigned>
1028 isBinOpWithConstantInt(
const Instruction *
I) {
1029 unsigned Opcode =
I->getOpcode();
1035 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
1036 Opcode == Instruction::AShr)
1037 return {
nullptr, 0};
1040 return {
nullptr, 0};
1042 struct InterchangeableInfo {
1045 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
1046 MulBIT | AShrBIT | ShlBIT;
1051 MaskType SeenBefore = 0;
1052 InterchangeableInfo(
const Instruction *I) : I(I) {}
1056 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
1057 if (Mask & InterchangeableMask) {
1058 SeenBefore |= OpcodeInMaskForm;
1059 Mask &= InterchangeableMask;
1064 bool equal(
unsigned Opcode) {
1065 return Opcode == I->getOpcode() && trySet(MainOpBIT, MainOpBIT);
1067 unsigned getOpcode()
const {
1068 MaskType Candidate = Mask & SeenBefore;
1069 if (Candidate & MainOpBIT)
1070 return I->getOpcode();
1071 if (Candidate & ShlBIT)
1072 return Instruction::Shl;
1073 if (Candidate & AShrBIT)
1074 return Instruction::AShr;
1075 if (Candidate & MulBIT)
1076 return Instruction::Mul;
1077 if (Candidate & AddBIT)
1078 return Instruction::Add;
1079 if (Candidate & SubBIT)
1080 return Instruction::Sub;
1081 if (Candidate & AndBIT)
1082 return Instruction::And;
1083 if (Candidate & OrBIT)
1084 return Instruction::Or;
1085 if (Candidate & XorBIT)
1086 return Instruction::Xor;
1090 bool hasDefinedOpcode()
const {
return (Mask & SeenBefore) > 0; }
1093 bool hasCandidateOpcode(
unsigned Opcode)
const {
1094 MaskType Candidate = Mask & SeenBefore;
1096 case Instruction::Shl:
1097 return Candidate & ShlBIT;
1098 case Instruction::AShr:
1099 return Candidate & AShrBIT;
1100 case Instruction::Mul:
1101 return Candidate & MulBIT;
1102 case Instruction::Add:
1103 return Candidate & AddBIT;
1104 case Instruction::Sub:
1105 return Candidate & SubBIT;
1106 case Instruction::And:
1107 return Candidate & AndBIT;
1108 case Instruction::Or:
1109 return Candidate & OrBIT;
1110 case Instruction::Xor:
1111 return Candidate & XorBIT;
1112 case Instruction::LShr:
1113 case Instruction::FAdd:
1114 case Instruction::FSub:
1115 case Instruction::FMul:
1116 case Instruction::SDiv:
1117 case Instruction::UDiv:
1118 case Instruction::FDiv:
1119 case Instruction::SRem:
1120 case Instruction::URem:
1121 case Instruction::FRem:
1131 unsigned FromOpcode = I->getOpcode();
1132 if (FromOpcode == ToOpcode)
1135 auto [CI, Pos] = isBinOpWithConstantInt(I);
1136 const APInt &FromCIValue = CI->getValue();
1137 unsigned FromCIValueBitWidth = FromCIValue.
getBitWidth();
1138 Type *RHSType = I->getOperand(Pos)->getType();
1140 switch (FromOpcode) {
1141 case Instruction::Shl:
1142 if (ToOpcode == Instruction::Add && FromCIValue.
isOne())
1143 return {I->getOperand(0), I->getOperand(0)};
1144 if (ToOpcode == Instruction::Mul) {
1145 RHS = ConstantInt::get(
1149 assert(FromCIValue.
isZero() &&
"Cannot convert the instruction.");
1154 case Instruction::Mul:
1156 if (ToOpcode == Instruction::Shl) {
1157 RHS = ConstantInt::get(
1158 RHSType, APInt(FromCIValueBitWidth, FromCIValue.
logBase2()));
1160 assert(FromCIValue.
isOne() &&
"Cannot convert the instruction.");
1165 case Instruction::Add:
1166 case Instruction::Sub:
1167 if (FromCIValue.
isZero()) {
1172 "Cannot convert the instruction.");
1173 APInt NegatedVal = APInt(FromCIValue);
1174 NegatedVal.negate();
1175 RHS = ConstantInt::get(RHSType, NegatedVal);
1178 case Instruction::And:
1184 assert(FromCIValue.
isZero() &&
"Cannot convert the instruction.");
1189 Value *
LHS = I->getOperand(1 - Pos);
1198 InterchangeableInfo MainOp;
1199 InterchangeableInfo AltOp;
1201 return ::isValidForAlternation(MainOp.I->getOpcode()) &&
1204 bool initializeAltOp(
const Instruction *
I) {
1214 BinOpSameOpcodeHelper(
const Instruction *MainOp,
1215 const Instruction *AltOp =
nullptr)
1216 : MainOp(MainOp), AltOp(AltOp) {}
1217 bool add(
const Instruction *
I) {
1219 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1220 unsigned Opcode =
I->getOpcode();
1221 MaskType OpcodeInMaskForm;
1224 case Instruction::Shl:
1225 OpcodeInMaskForm = ShlBIT;
1227 case Instruction::AShr:
1228 OpcodeInMaskForm = AShrBIT;
1230 case Instruction::Mul:
1231 OpcodeInMaskForm = MulBIT;
1233 case Instruction::Add:
1234 OpcodeInMaskForm = AddBIT;
1236 case Instruction::Sub:
1237 OpcodeInMaskForm = SubBIT;
1239 case Instruction::And:
1240 OpcodeInMaskForm = AndBIT;
1242 case Instruction::Or:
1243 OpcodeInMaskForm = OrBIT;
1245 case Instruction::Xor:
1246 OpcodeInMaskForm = XorBIT;
1249 return MainOp.equal(Opcode) ||
1250 (initializeAltOp(
I) && AltOp.equal(Opcode));
1252 MaskType InterchangeableMask = OpcodeInMaskForm;
1253 ConstantInt *CI = isBinOpWithConstantInt(
I).first;
1255 constexpr MaskType CanBeAll =
1256 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1257 const APInt &CIValue = CI->
getValue();
1259 case Instruction::Shl:
1261 InterchangeableMask = CIValue.
isZero() ? CanBeAll : MulBIT | ShlBIT;
1262 if (CIValue.
isOne())
1263 InterchangeableMask |= AddBIT;
1265 case Instruction::Mul:
1266 if (CIValue.
isOne()) {
1267 InterchangeableMask = CanBeAll;
1271 InterchangeableMask = MulBIT | ShlBIT;
1273 case Instruction::Add:
1274 case Instruction::Sub:
1275 InterchangeableMask = CIValue.
isZero() ? CanBeAll : SubBIT | AddBIT;
1277 case Instruction::And:
1279 InterchangeableMask = CanBeAll;
1281 case Instruction::Xor:
1283 InterchangeableMask = XorBIT | OrBIT | SubBIT | AddBIT;
1287 InterchangeableMask = CanBeAll;
1291 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1292 (initializeAltOp(
I) &&
1293 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1295 unsigned getMainOpcode()
const {
return MainOp.getOpcode(); }
1296 bool hasDefinedMainOpcode()
const {
return MainOp.hasDefinedOpcode(); }
1298 bool hasCandidateOpcode(
unsigned Opcode)
const {
1299 return MainOp.hasCandidateOpcode(Opcode);
1301 bool hasAltOp()
const {
return AltOp.I; }
1302 unsigned getAltOpcode()
const {
1303 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1305 bool hasDefinedAltOpcode()
const {
1306 return !hasAltOp() || AltOp.hasDefinedOpcode();
1309 return MainOp.getOperand(
I);
1314class InstructionsState {
1340 bool HasCopyables =
false;
1344 assert(valid() &&
"InstructionsState is invalid.");
1349 assert(valid() &&
"InstructionsState is invalid.");
1354 unsigned getOpcode()
const {
return getMainOp()->getOpcode(); }
1356 unsigned getAltOpcode()
const {
return getAltOp()->getOpcode(); }
1359 bool isAltShuffle()
const {
return getMainOp() != getAltOp(); }
1368 Instruction *getMatchingMainOpOrAltOp(Instruction *
I)
const {
1369 assert(MainOp &&
"MainOp cannot be nullptr.");
1370 if (
I->getOpcode() == MainOp->getOpcode())
1372 if (MainOp->getOpcode() == Instruction::Select &&
1373 I->getOpcode() == Instruction::ZExt && !isAltShuffle())
1376 assert(AltOp &&
"AltOp cannot be nullptr.");
1377 if (
I->getOpcode() == AltOp->getOpcode())
1379 if (!
I->isBinaryOp())
1381 BinOpSameOpcodeHelper
Converter(MainOp);
1384 if (isAltShuffle() && !
Converter.hasCandidateOpcode(MainOp->getOpcode())) {
1385 BinOpSameOpcodeHelper AltConverter(AltOp);
1386 if (AltConverter.add(
I) && AltConverter.add(AltOp) &&
1387 AltConverter.hasCandidateOpcode(AltOp->getOpcode()))
1390 if (
Converter.hasAltOp() && !isAltShuffle())
1392 return Converter.hasAltOp() ? AltOp : MainOp;
1396 bool isShiftOp()
const {
1397 return getMainOp()->isShift() && getAltOp()->isShift();
1402 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1406 bool isMulDivLikeOp()
const {
1407 constexpr std::array<unsigned, 8> MulDiv = {
1408 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1409 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1410 Instruction::URem, Instruction::FRem};
1416 bool isAddSubLikeOp()
const {
1417 constexpr std::array<unsigned, 4>
AddSub = {
1418 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1425 bool isCmpOp()
const {
1426 return (getOpcode() == Instruction::ICmp ||
1427 getOpcode() == Instruction::FCmp) &&
1428 getAltOpcode() == getOpcode();
1432 bool valid()
const {
return MainOp && AltOp; }
1434 explicit operator bool()
const {
return valid(); }
1436 InstructionsState() =
delete;
1437 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1438 bool HasCopyables =
false)
1439 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1440 static InstructionsState invalid() {
return {
nullptr,
nullptr}; }
1443 bool isCopyableElement(
Value *V)
const {
1444 assert(valid() &&
"InstructionsState is invalid.");
1447 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1452 if (
I->getParent() != MainOp->getParent() &&
1456 if (
I->getOpcode() == MainOp->getOpcode())
1458 if (!
I->isBinaryOp())
1460 BinOpSameOpcodeHelper
Converter(MainOp);
1467 bool isExpandedBinOp(
Value *V)
const {
1468 assert(valid() &&
"InstructionsState is invalid.");
1469 if (isCopyableElement(V))
1474 auto CheckForTransformedOpcode = [](
const Instruction *RefOp,
1477 case Instruction::Add:
1478 switch (ExpandingOp->getOpcode()) {
1479 case Instruction::Shl:
1490 Instruction *MainOp = getMatchingMainOpOrAltOp(ExpandingOp);
1492 "The instruction should be compatible with either main or alt op.");
1493 return CheckForTransformedOpcode(MainOp, ExpandingOp);
1498 bool isExpandedOperand(Instruction *
I,
unsigned Idx)
const {
1499 assert(isExpandedBinOp(
I) &&
"Expected an expanded binop.");
1500 switch (
I->getOpcode()) {
1501 case Instruction::Shl:
1510 bool isNonSchedulable(
Value *V)
const {
1511 assert(valid() &&
"InstructionsState is invalid.");
1518 if (getMainOp() == V)
1520 if (isCopyableElement(V)) {
1521 auto IsNonSchedulableCopyableElement = [
this](
Value *
V) {
1523 return !
I ||
isa<PHINode>(
I) ||
I->getParent() != MainOp->getParent() ||
1528 !MainOp->comesBefore(
I));
1531 return IsNonSchedulableCopyableElement(V);
1538 bool areInstructionsWithCopyableElements()
const {
1539 assert(valid() &&
"InstructionsState is invalid.");
1540 return HasCopyables;
1544std::pair<Instruction *, SmallVector<Value *>>
1546 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(
I);
1547 assert(SelectedOp &&
"Cannot convert the instruction.");
1548 if (
I->isBinaryOp()) {
1550 return std::make_pair(SelectedOp,
Converter.getOperand(SelectedOp));
1569 for (
Value *V : VL) {
1574 if (Inst->getOpcode() == Opcode)
1588 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1599 "Assessing comparisons of different types?");
1609 return (BasePred == Pred &&
1611 (BasePred == SwappedPred &&
1622 return InstructionsState::invalid();
1626 return InstructionsState::invalid();
1631 (VL.
size() == 2 && InstCnt < 2))
1632 return InstructionsState::invalid();
1641 unsigned AltOpcode = Opcode;
1643 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1644 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1646 UniquePreds.
insert(BasePred);
1647 UniqueNonSwappedPreds.
insert(BasePred);
1648 for (
Value *V : VL) {
1655 UniqueNonSwappedPreds.
insert(CurrentPred);
1656 if (!UniquePreds.
contains(CurrentPred) &&
1657 !UniquePreds.
contains(SwappedCurrentPred))
1658 UniquePreds.
insert(CurrentPred);
1663 return UniqueNonSwappedPreds.
size() > 2 && UniquePreds.
size() == 2;
1673 return InstructionsState::invalid();
1675 bool AnyPoison = InstCnt != VL.
size();
1686 if (AnyPoison && (
I->isIntDivRem() ||
I->isFPDivRem() ||
isa<CallInst>(
I)))
1687 return InstructionsState::invalid();
1688 unsigned InstOpcode =
I->getOpcode();
1690 if (BinOpHelper.add(
I))
1695 Value *Op1 =
I->getOperand(0);
1698 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1700 if (Opcode == AltOpcode) {
1703 "Cast isn't safe for alternation, logic needs to be updated!");
1704 AltOpcode = InstOpcode;
1711 Type *Ty0 = BaseInst->getOperand(0)->getType();
1712 Type *Ty1 = Inst->getOperand(0)->getType();
1714 assert(InstOpcode == Opcode &&
"Expected same CmpInst opcode.");
1715 assert(InstOpcode == AltOpcode &&
1716 "Alternate instructions are only supported by BinaryOperator "
1724 if ((VL.
size() == 2 || SwappedPredsCompatible) &&
1725 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1731 if (MainOp != AltOp) {
1734 }
else if (BasePred != CurrentPred) {
1737 "CmpInst isn't safe for alternation, logic needs to be updated!");
1742 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1743 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1746 }
else if (InstOpcode == Opcode) {
1747 assert(InstOpcode == AltOpcode &&
1748 "Alternate instructions are only supported by BinaryOperator and "
1751 if (Gep->getNumOperands() != 2 ||
1753 return InstructionsState::invalid();
1756 return InstructionsState::invalid();
1759 if (!LI->isSimple() || !BaseLI->isSimple())
1760 return InstructionsState::invalid();
1764 return InstructionsState::invalid();
1765 if (
Call->hasOperandBundles() &&
1767 !std::equal(
Call->op_begin() +
Call->getBundleOperandsStartIndex(),
1768 Call->op_begin() +
Call->getBundleOperandsEndIndex(),
1771 return InstructionsState::invalid();
1774 return InstructionsState::invalid();
1777 if (Mappings.
size() != BaseMappings.
size() ||
1778 Mappings.
front().ISA != BaseMappings.
front().ISA ||
1779 Mappings.
front().ScalarName != BaseMappings.
front().ScalarName ||
1780 Mappings.
front().VectorName != BaseMappings.
front().VectorName ||
1781 Mappings.
front().Shape.VF != BaseMappings.
front().Shape.VF ||
1782 Mappings.
front().Shape.Parameters !=
1783 BaseMappings.
front().Shape.Parameters)
1784 return InstructionsState::invalid();
1789 return InstructionsState::invalid();
1793 if (!BinOpHelper.hasDefinedMainOpcode() ||
1794 !BinOpHelper.hasDefinedAltOpcode())
1795 return InstructionsState::invalid();
1797 assert(MainOp &&
"Cannot find MainOp with Opcode from BinOpHelper.");
1799 assert(AltOp &&
"Cannot find AltOp with Opcode from BinOpHelper.");
1802 "Incorrect implementation of allSameOpcode.");
1803 InstructionsState S(MainOp, AltOp);
1809 "Invalid InstructionsState.");
1817 return all_of(VL, [&](
Value *V) {
return V->getType() == Ty; });
1827 unsigned Opcode = UserInst->
getOpcode();
1829 case Instruction::Load: {
1833 case Instruction::Store: {
1835 return (
SI->getPointerOperand() == Scalar);
1837 case Instruction::Call: {
1841 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1842 Arg.value().get() == Scalar;
1862 return LI->isSimple();
1864 return SI->isSimple();
1866 return !
MI->isVolatile();
1874 bool ExtendingManyInputs =
false) {
1875 if (SubMask.
empty())
1878 (!ExtendingManyInputs || SubMask.
size() > Mask.size() ||
1881 "SubMask with many inputs support must be larger than the mask.");
1883 Mask.append(SubMask.
begin(), SubMask.
end());
1887 int TermValue = std::min(Mask.size(), SubMask.
size());
1888 for (
int I = 0,
E = SubMask.
size();
I <
E; ++
I) {
1890 (!ExtendingManyInputs &&
1891 (SubMask[
I] >= TermValue || Mask[SubMask[
I]] >= TermValue)))
1893 NewMask[
I] = Mask[SubMask[
I]];
1909 const size_t Sz = Order.
size();
1912 for (
unsigned I = 0;
I < Sz; ++
I) {
1914 UnusedIndices.
reset(Order[
I]);
1916 MaskedIndices.
set(
I);
1918 if (MaskedIndices.
none())
1921 "Non-synced masked/available indices.");
1925 assert(Idx >= 0 &&
"Indices must be synced.");
1935 unsigned Opcode0,
unsigned Opcode1) {
1942 OpcodeMask.
set(Lane * ScalarTyNumElements,
1943 Lane * ScalarTyNumElements + ScalarTyNumElements);
1952 "Expected scalar constants.");
1955 std::fill_n(NewVal.begin() +
I * VF, VF, V);
1962 const unsigned E = Indices.
size();
1964 for (
unsigned I = 0;
I <
E; ++
I)
1965 Mask[Indices[
I]] =
I;
1971 assert(!Mask.empty() &&
"Expected non-empty mask.");
1975 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
1977 Scalars[Mask[
I]] = Prev[
I];
1990 auto *IO = dyn_cast<Instruction>(V);
1993 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
2006 return !
I->mayReadOrWriteMemory() && !
I->hasNUsesOrMore(
UsesLimit) &&
2008 auto *IU = dyn_cast<Instruction>(U);
2011 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
2027 return !VL.
empty() &&
2043 return NumParts > 0 && NumParts < Sz &&
has_single_bit(Sz / NumParts) &&
2053 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
2054 unsigned NumParts =
TTI.getNumberOfParts(VecTy);
2055 if (NumParts == 0 || NumParts >= Limit)
2061 if (NumParts >= Sz || PWSz % NumParts != 0 ||
2062 (PWSz / NumParts) % ScalarSz != 0 ||
2065 const unsigned NumElts = PWSz / NumParts;
2074 class ScheduleEntity;
2076 class ScheduleCopyableData;
2077 class ScheduleBundle;
2114 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
2115 AC(AC), DB(DB), DL(DL), ORE(ORE),
2134 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
2147 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
2148 VectorValuesAndScales = {});
2169 const SmallDenseSet<Value *> &UserIgnoreLst);
2176 assert(!VectorizableTree.empty() &&
"No graph to get the first node from");
2177 return VectorizableTree.front()->Scalars;
2183 const TreeEntry &Root = *VectorizableTree.front();
2184 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2185 !Root.Scalars.
front()->getType()->isIntegerTy())
2186 return std::nullopt;
2187 auto It = MinBWs.find(&Root);
2188 if (It != MinBWs.end())
2192 if (Root.getOpcode() == Instruction::ZExt ||
2193 Root.getOpcode() == Instruction::SExt)
2194 return std::make_pair(
cast<CastInst>(Root.getMainOp())->getSrcTy(),
2195 Root.getOpcode() == Instruction::SExt);
2196 return std::nullopt;
2202 return MinBWs.at(VectorizableTree.front().get()).second;
2207 if (ReductionBitWidth == 0 ||
2208 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2209 ReductionBitWidth >=
2210 DL->getTypeSizeInBits(
2211 VectorizableTree.front()->Scalars.front()->getType()))
2213 VectorizableTree.front()->Scalars.front()->getType(),
2214 VectorizableTree.front()->getVectorFactor());
2217 VectorizableTree.front()->Scalars.front()->getContext(),
2219 VectorizableTree.front()->getVectorFactor());
2224 return VectorizableTree.front()->hasState() &&
2225 (VectorizableTree.front()->CombinedOp == TreeEntry::ReducedBitcast ||
2226 VectorizableTree.front()->CombinedOp ==
2227 TreeEntry::ReducedBitcastBSwap ||
2228 VectorizableTree.front()->CombinedOp ==
2229 TreeEntry::ReducedBitcastLoads ||
2230 VectorizableTree.front()->CombinedOp ==
2231 TreeEntry::ReducedBitcastBSwapLoads) &&
2232 VectorizableTree.front()->State == TreeEntry::Vectorize;
2237 return VectorizableTree.front()->hasState() &&
2238 VectorizableTree.front()->CombinedOp ==
2239 TreeEntry::ReducedCmpBitcast &&
2240 VectorizableTree.front()->State == TreeEntry::Vectorize;
2258 VectorizableTree.clear();
2259 ScalarToTreeEntries.clear();
2260 DeletedNodes.clear();
2261 TransformedToGatherNodes.clear();
2262 OperandsToTreeEntry.clear();
2263 ScalarsInSplitNodes.clear();
2265 NonScheduledFirst.clear();
2266 EntryToLastInstruction.clear();
2267 LastInstructionToPos.clear();
2268 LoadEntriesToVectorize.clear();
2269 IsGraphTransformMode =
false;
2270 GatheredLoadsEntriesFirst.reset();
2271 CompressEntryToData.clear();
2272 ExternalUses.clear();
2273 ExternalUsesAsOriginalScalar.clear();
2274 ExternalUsesWithNonUsers.clear();
2275 for (
auto &Iter : BlocksSchedules) {
2276 BlockScheduling *BS = Iter.second.get();
2280 ReductionBitWidth = 0;
2282 CastMaxMinBWSizes.reset();
2283 ExtraBitWidthNodes.clear();
2284 InstrElementSize.clear();
2285 UserIgnoreList =
nullptr;
2286 PostponedGathers.clear();
2287 ValueToGatherNodes.clear();
2288 TreeEntryToStridedPtrInfoMap.clear();
2289 CurrentLoopNest.clear();
2290 MergedLoopBTCs.clear();
2306 assert(!Order.
empty() &&
"expected non-empty order");
2307 const unsigned Sz = Order.
size();
2309 return P.value() ==
P.index() ||
P.value() == Sz;
2322 bool IgnoreReorder);
2335 std::optional<OrdersType>
2373 return MaxVecRegSize;
2378 return MinVecRegSize;
2386 unsigned MaxVF =
MaxVFOption.getNumOccurrences() ?
2387 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2388 return MaxVF ? MaxVF : UINT_MAX;
2410 Align Alignment,
const int64_t Diff,
2411 const size_t Sz)
const;
2451 Value *Ptr0, StridedPtrInfo &SPtrInfo)
const;
2470 Align CommonAlignment,
2472 StridedPtrInfo &SPtrInfo,
bool IsLoad)
const;
2487 StridedPtrInfo &SPtrInfo,
2488 unsigned *BestVF =
nullptr,
2489 bool TryRecursiveCheck =
true)
const;
2495 auto IsSame = [&](
const TreeEntry *TE) {
return TE->isSame(VL); };
2501 return any_of(getTreeEntries(S.getMainOp()), IsSame) ||
2502 any_of(ValueToGatherNodes.lookup(S.getMainOp()), IsSame);
2509 for (
Value *V : VL) {
2513 for (
const TreeEntry *TE : ValueToGatherNodes.lookup(V)) {
2514 if (!Visited.
insert(TE).second)
2525 ListOfKnonwnNonVectorizableLoads.insert(
hash_value(VL));
2529 template <
typename T>
2531 return ListOfKnonwnNonVectorizableLoads.contains(
hash_value(VL));
2556 OS <<
"{User:" << (
UserTE ? std::to_string(
UserTE->Idx) :
"null")
2557 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
2582 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2583 MaxLevel(MaxLevel) {}
2639 auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
2644 auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
2646 return U == U1 || U == U2 || R.isVectorized(U);
2649 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2652 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
2654 ((
int)V1->getNumUses() == NumLanes ||
2655 AllUsersAreInternal(V1, V2)))
2661 auto CheckSameEntryOrFail = [&]() {
2666 any_of(TEs2, [&](TreeEntry *E) {
return Set.contains(E); }))
2675 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2677 return CheckSameEntryOrFail();
2680 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
2681 LI2->getPointerOperand(), DL, SE,
true);
2682 if (!Dist || *Dist == 0) {
2685 R.TTI->isLegalMaskedGather(
2688 return CheckSameEntryOrFail();
2692 if (std::abs(*Dist) > NumLanes / 2)
2725 Value *EV2 =
nullptr;
2738 int Dist = Idx2 - Idx1;
2741 if (std::abs(Dist) == 0)
2743 if (std::abs(Dist) > NumLanes / 2)
2750 return CheckSameEntryOrFail();
2756 if (I1->getParent() != I2->getParent())
2757 return CheckSameEntryOrFail();
2766 V->getType() ==
Cond->getType()) ||
2769 V->getType() ==
Cond->getType()))
2778 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.
empty() ||
2779 !S.isAltShuffle()) &&
2783 S.getMainOp()->getNumOperands();
2795 return CheckSameEntryOrFail();
2829 int ShallowScoreAtThisLevel =
2840 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2843 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2845 ShallowScoreAtThisLevel))
2846 return ShallowScoreAtThisLevel;
2847 assert(I1 && I2 &&
"Should have early exited.");
2854 if (I1->getNumOperands() != I2->getNumOperands())
2856 for (
unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2857 OpIdx1 != NumOperands1; ++OpIdx1) {
2859 int MaxTmpScore = 0;
2860 unsigned MaxOpIdx2 = 0;
2861 bool FoundBest =
false;
2865 ? I2->getNumOperands()
2866 : std::min(I2->getNumOperands(), OpIdx1 + 1);
2867 assert(FromIdx <= ToIdx &&
"Bad index");
2868 for (
unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2870 if (Op2Used.
count(OpIdx2))
2875 I1, I2, CurrLevel + 1, {});
2878 TmpScore > MaxTmpScore) {
2879 MaxTmpScore = TmpScore;
2886 Op2Used.
insert(MaxOpIdx2);
2887 ShallowScoreAtThisLevel += MaxTmpScore;
2890 return ShallowScoreAtThisLevel;
2921 struct OperandData {
2922 OperandData() =
default;
2923 OperandData(
Value *V,
bool APO,
bool IsUsed)
2924 : V(V), APO(APO), IsUsed(IsUsed) {}
2934 bool IsUsed =
false;
2943 enum class ReorderingMode {
2957 unsigned ArgSize = 0;
2963 const Loop *L =
nullptr;
2966 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
2967 return OpsVec[
OpIdx][Lane];
2971 const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const {
2972 return OpsVec[
OpIdx][Lane];
2977 for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
2979 for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2981 OpsVec[
OpIdx][Lane].IsUsed =
false;
2985 void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
2986 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2998 int getSplatScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx,
3000 Value *IdxLaneV = getData(Idx, Lane).V;
3013 unsigned UniquesCount = Uniques.
size();
3014 auto IdxIt = Uniques.
find(IdxLaneV);
3015 unsigned UniquesCntWithIdxLaneV =
3016 IdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
3018 auto OpIdxIt = Uniques.
find(OpIdxLaneV);
3019 unsigned UniquesCntWithOpIdxLaneV =
3020 OpIdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
3021 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
3023 return std::min(
bit_ceil(UniquesCntWithOpIdxLaneV) -
3024 UniquesCntWithOpIdxLaneV,
3025 UniquesCntWithOpIdxLaneV -
3027 ((IdxIt != Uniques.
end() && UsedLanes.
test(IdxIt->second))
3028 ? UniquesCntWithIdxLaneV -
bit_floor(UniquesCntWithIdxLaneV)
3029 :
bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
3038 int getExternalUseScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
3039 Value *IdxLaneV = getData(Idx, Lane).V;
3052 return R.areAllUsersVectorized(IdxLaneI)
3060 static const int ScoreScaleFactor = 10;
3068 int Lane,
unsigned OpIdx,
unsigned Idx,
3078 int SplatScore = getSplatScore(Lane,
OpIdx, Idx, UsedLanes);
3079 if (Score <= -SplatScore) {
3083 Score += SplatScore;
3089 Score *= ScoreScaleFactor;
3090 Score += getExternalUseScore(Lane,
OpIdx, Idx);
3108 std::optional<unsigned>
3109 getBestOperand(
unsigned OpIdx,
int Lane,
int LastLane,
3113 unsigned NumOperands = getNumOperands();
3116 Value *OpLastLane = getData(
OpIdx, LastLane).V;
3119 ReorderingMode RMode = ReorderingModes[
OpIdx];
3120 if (RMode == ReorderingMode::Failed)
3121 return std::nullopt;
3124 bool OpIdxAPO = getData(
OpIdx, Lane).APO;
3130 std::optional<unsigned> Idx;
3134 BestScoresPerLanes.try_emplace(std::make_pair(
OpIdx, Lane), 0)
3140 bool IsUsed = RMode == ReorderingMode::Splat ||
3141 RMode == ReorderingMode::Constant ||
3142 RMode == ReorderingMode::Load;
3144 for (
unsigned Idx = 0; Idx != NumOperands; ++Idx) {
3146 OperandData &OpData = getData(Idx, Lane);
3148 bool OpAPO = OpData.APO;
3157 if (OpAPO != OpIdxAPO)
3162 case ReorderingMode::Load:
3163 case ReorderingMode::Opcode: {
3164 bool LeftToRight = Lane > LastLane;
3165 Value *OpLeft = (LeftToRight) ? OpLastLane :
Op;
3166 Value *OpRight = (LeftToRight) ?
Op : OpLastLane;
3167 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
3168 OpIdx, Idx, IsUsed, UsedLanes);
3169 if (Score >
static_cast<int>(BestOp.Score) ||
3170 (Score > 0 && Score ==
static_cast<int>(BestOp.Score) &&
3173 BestOp.Score = Score;
3174 BestScoresPerLanes[std::make_pair(
OpIdx, Lane)] = Score;
3178 case ReorderingMode::Constant:
3180 (!BestOp.Score && L && L->isLoopInvariant(
Op))) {
3184 BestScoresPerLanes[std::make_pair(
OpIdx, Lane)] =
3191 case ReorderingMode::Splat:
3193 IsUsed =
Op == OpLastLane;
3194 if (
Op == OpLastLane) {
3196 BestScoresPerLanes[std::make_pair(
OpIdx, Lane)] =
3202 case ReorderingMode::Failed:
3208 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
3212 return std::nullopt;
3219 unsigned getBestLaneToStartReordering()
const {
3220 unsigned Min = UINT_MAX;
3221 unsigned SameOpNumber = 0;
3232 for (
int I = getNumLanes();
I > 0; --
I) {
3233 unsigned Lane =
I - 1;
3234 OperandsOrderData NumFreeOpsHash =
3235 getMaxNumOperandsThatCanBeReordered(Lane);
3238 if (NumFreeOpsHash.NumOfAPOs < Min) {
3239 Min = NumFreeOpsHash.NumOfAPOs;
3240 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3242 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3243 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
3244 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
3247 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3248 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
3249 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
3250 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
3251 auto [It, Inserted] =
3252 HashMap.
try_emplace(NumFreeOpsHash.Hash, 1, Lane);
3258 unsigned BestLane = 0;
3259 unsigned CntMin = UINT_MAX;
3261 if (
Data.second.first < CntMin) {
3262 CntMin =
Data.second.first;
3263 BestLane =
Data.second.second;
3270 struct OperandsOrderData {
3273 unsigned NumOfAPOs = UINT_MAX;
3276 unsigned NumOpsWithSameOpcodeParent = 0;
3290 OperandsOrderData getMaxNumOperandsThatCanBeReordered(
unsigned Lane)
const {
3291 unsigned CntTrue = 0;
3292 unsigned NumOperands = getNumOperands();
3302 bool AllUndefs =
true;
3303 unsigned NumOpsWithSameOpcodeParent = 0;
3308 const OperandData &OpData = getData(
OpIdx, Lane);
3315 I->getParent() != Parent) {
3316 if (NumOpsWithSameOpcodeParent == 0) {
3317 NumOpsWithSameOpcodeParent = 1;
3319 Parent =
I->getParent();
3321 --NumOpsWithSameOpcodeParent;
3324 ++NumOpsWithSameOpcodeParent;
3333 OperandsOrderData
Data;
3334 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
3335 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3342 const InstructionsState &S) {
3346 return VL.
size() == getNumLanes();
3348 "Expected same number of lanes");
3349 assert(S.valid() &&
"InstructionsState is invalid.");
3355 OpsVec.resize(ArgSize);
3356 unsigned NumLanes = VL.
size();
3357 for (OperandDataVec &
Ops : OpsVec)
3358 Ops.resize(NumLanes);
3373 OpsVec[
OpIdx][Lane] = {Operands[
OpIdx][Lane],
true,
false};
3376 bool IsInverseOperation =
false;
3377 if (S.isCopyableElement(VL[Lane])) {
3379 IsInverseOperation =
3382 assert(
I &&
"Expected instruction");
3383 auto [SelectedOp,
Ops] = convertTo(
I, S);
3390 bool APO = (
OpIdx == 0) ?
false : IsInverseOperation;
3391 OpsVec[
OpIdx][Lane] = {Operands[
OpIdx][Lane], APO,
false};
/// \returns the stored operand count, ArgSize.
3397 unsigned getNumOperands()
const {
return ArgSize; }
/// \returns the number of lanes, read as the size of the first operand
/// vector OpsVec[0].
/// NOTE(review): OpsVec[0] is indexed unconditionally; presumably callers only
/// use this after OpsVec has been populated - confirm.
3400 unsigned getNumLanes()
const {
return OpsVec[0].size(); }
3403 Value *getValue(
unsigned OpIdx,
unsigned Lane)
const {
3404 return getData(
OpIdx, Lane).V;
/// \returns true when OpsVec holds no operand vectors.
3408 bool empty()
const {
return OpsVec.empty(); }
/// Discards all stored operand data by clearing OpsVec.
3411 void clear() { OpsVec.clear(); }
3416 bool shouldBroadcast(
Value *
Op,
unsigned OpIdx,
unsigned Lane) {
3418 "Op is expected to be getValue(OpIdx, Lane).");
3422 bool OpAPO = getData(
OpIdx, Lane).APO;
3423 bool IsInvariant = L && L->isLoopInvariant(
Op);
3425 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3429 bool FoundCandidate =
false;
3430 for (
unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3431 OperandData &
Data = getData(OpI, Ln);
3432 if (
Data.APO != OpAPO ||
Data.IsUsed)
3434 Value *OpILane = getValue(OpI, Lane);
3458 L->isLoopInvariant(
Data.V))) {
3459 FoundCandidate =
true;
3466 if (!FoundCandidate)
3469 return getNumLanes() == 2 || Cnt > 1;
3476 "Op is expected to be getValue(OpIdx, Lane).");
3477 bool OpAPO = getData(
OpIdx, Lane).APO;
3478 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3482 const OperandData &
Data = getData(OpI, Ln);
3483 if (
Data.APO != OpAPO ||
Data.IsUsed)
3485 Value *OpILn = getValue(OpI, Ln);
3486 return (L && L->isLoopInvariant(OpILn)) ||
3498 const InstructionsState &S,
const BoUpSLP &R)
3499 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3500 L(R.LI->getLoopFor(S.getMainOp()->
getParent())) {
3502 appendOperands(RootVL, Operands, S);
3510 "Expected same num of lanes across all operands");
3511 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3512 OpVL[Lane] = OpsVec[
OpIdx][Lane].V;
3520 unsigned NumOperands = getNumOperands();
3521 unsigned NumLanes = getNumLanes();
3541 unsigned FirstLane = getBestLaneToStartReordering();
3550 if (shouldBroadcast(OpLane0,
OpIdx, FirstLane) ||
3551 !canBeVectorized(OpILane0,
OpIdx, FirstLane))
3552 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3554 ReorderingModes[
OpIdx] = ReorderingMode::Load;
3556 ReorderingModes[
OpIdx] = ReorderingMode::Opcode;
3558 ReorderingModes[
OpIdx] = ReorderingMode::Constant;
3561 ReorderingModes[
OpIdx] = ReorderingMode::Splat;
3570 auto &&SkipReordering = [
this]() {
3573 for (
const OperandData &
Data : Op0)
3576 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
3577 if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
3582 return UniqueValues.
size() != 2;
3594 if (SkipReordering())
3597 bool StrategyFailed =
false;
3605 for (
unsigned I = 0;
I < NumOperands; ++
I)
3606 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
3609 UsedLanes.
set(FirstLane);
3610 for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3612 for (
int Direction : {+1, -1}) {
3613 int Lane = FirstLane + Direction * Distance;
3614 if (Lane < 0 || Lane >= (
int)NumLanes)
3616 UsedLanes.
set(Lane);
3617 int LastLane = Lane - Direction;
3618 assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
3623 std::optional<unsigned> BestIdx =
3624 getBestOperand(
OpIdx, Lane, LastLane, ReorderingModes,
3625 MainAltOps[
OpIdx], UsedLanes);
3632 swap(
OpIdx, *BestIdx, Lane);
3635 StrategyFailed =
true;
3639 OperandData &AltOp = getData(
OpIdx, Lane);
3640 InstructionsState OpS =
3642 if (OpS && OpS.isAltShuffle())
3649 if (!StrategyFailed)
3654#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3657 case ReorderingMode::Load:
3659 case ReorderingMode::Opcode:
3661 case ReorderingMode::Constant:
3663 case ReorderingMode::Splat:
3665 case ReorderingMode::Failed:
3686 const unsigned Indent = 2;
3688 for (
const OperandDataVec &OpDataVec : OpsVec) {
3689 OS <<
"Operand " << Cnt++ <<
"\n";
3690 for (
const OperandData &OpData : OpDataVec) {
3691 OS.
indent(Indent) <<
"{";
3692 if (
Value *V = OpData.V)
3696 OS <<
", APO:" << OpData.APO <<
"}\n";
3713 std::pair<std::optional<int>,
int>
3718 int BestScore = Limit;
3719 std::optional<int> Index;
3720 for (
int I :
seq<int>(0, Candidates.size())) {
3722 Candidates[
I].second,
3725 if (Score > BestScore) {
3730 return std::make_pair(Index, BestScore);
3740 DeletedInstructions.insert(
I);
3745 template <
typename T>
3748 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
3749 VectorValuesAndScales) {
3751 for (T *V : DeadVals) {
3756 for (T *V : DeadVals) {
3757 if (!V || !Processed.
insert(V).second)
3762 for (
Use &U :
I->operands()) {
3764 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
3766 (Entries.
empty() ||
none_of(Entries, [&](
const TreeEntry *Entry) {
3767 return Entry->VectorizedValue == OpI;
3771 I->dropAllReferences();
3773 for (T *V : DeadVals) {
3775 if (!
I->getParent())
3780 cast<Instruction>(U.getUser()));
3782 "trying to erase instruction with users.");
3783 I->removeFromParent();
3787 while (!DeadInsts.
empty()) {
3790 if (!VI || !VI->getParent())
3793 "Live instruction found in dead worklist!");
3794 assert(VI->use_empty() &&
"Instructions with uses are not dead.");
3801 for (
Use &OpU : VI->operands()) {
3802 Value *OpV = OpU.get();
3814 if (!DeletedInstructions.contains(OpI) &&
3815 (!OpI->getType()->isVectorTy() ||
3817 VectorValuesAndScales,
3818 [&](
const std::tuple<WeakTrackingVH, unsigned, bool, bool>
3819 &V) {
return std::get<0>(V) == OpI; })) &&
3824 VI->removeFromParent();
3826 SE->forgetValue(VI);
3833 return AnalyzedReductionsRoots.count(
I);
3838 AnalyzedReductionsRoots.insert(
I);
3843 return AnalyzedReductionVals.contains(
hash_value(VL));
3848 AnalyzedReductionVals.insert(
hash_value(VL));
3852 AnalyzedReductionsRoots.clear();
3853 AnalyzedReductionVals.clear();
3854 AnalyzedMinBWVals.clear();
3862 return MustGather.contains(V);
3866 return NonScheduledFirst.contains(V);
3871 assert(V &&
"V cannot be nullptr.");
3873 return any_of(Entries, [&](
const TreeEntry *E) {
3874 return !DeletedNodes.contains(E) && !TransformedToGatherNodes.contains(E);
3885 const InstructionsState &LocalState,
3897 bool collectValuesToDemote(
3898 const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
3901 bool &IsProfitableToDemote,
bool IsTruncRoot)
const;
3910 void buildReorderableOperands(
3918 void reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const;
3921 bool areAllUsersVectorized(
3926 unsigned getNumScalarInsts()
const;
3930 unsigned getNumVectorInsts()
const;
3938 const TreeEntry *getOperandEntry(
const TreeEntry *E,
unsigned Idx)
const;
3939 TreeEntry *getOperandEntry(TreeEntry *E,
unsigned Idx) {
3940 return const_cast<TreeEntry *
>(
3941 getOperandEntry(
const_cast<const TreeEntry *
>(E), Idx));
3947 Instruction *getRootEntryInstruction(
const TreeEntry &Entry)
const;
3951 getCastContextHint(
const TreeEntry &TE)
const;
3958 uint64_t getScaleToLoopIterations(
const TreeEntry &TE,
3959 Value *Scalar =
nullptr,
3977 uint64_t getGatherNodeEffectiveScale(
const TreeEntry &TE);
3998 unsigned InterleaveFactor = 0);
4009 bool ResizeAllowed =
false)
const;
4016 Value *vectorizeOperand(TreeEntry *
E,
unsigned NodeIdx);
4021 template <
typename BVTy,
typename ResTy,
typename... Args>
4022 ResTy processBuildVector(
const TreeEntry *
E,
Type *ScalarTy, Args &...Params);
4027 Value *createBuildVector(
const TreeEntry *
E,
Type *ScalarTy);
4033 Instruction &getLastInstructionInBundle(
const TreeEntry *
E);
4040 std::optional<TargetTransformInfo::ShuffleKind>
4052 unsigned NumParts)
const;
4064 std::optional<TargetTransformInfo::ShuffleKind>
4065 isGatherShuffledSingleRegisterEntry(
4068 unsigned SliceSize);
4082 isGatherShuffledEntry(
4085 unsigned NumParts,
bool ForOrder =
false);
4091 Type *ScalarTy)
const;
4095 void setInsertPointAfterBundle(
const TreeEntry *
E);
4105 bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
4110 void tryToVectorizeGatheredLoads(
4112 std::tuple<BasicBlock *, Value *, Type *>,
4120 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
4136 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
4140 void reorderGatherNode(TreeEntry &TE);
4147 bool matchesShlZExt(
const TreeEntry &TE,
OrdersType &Order,
bool &IsBSwap,
4148 bool &ForLoads)
const;
4152 bool matchesInversedZExtSelect(
4153 const TreeEntry &SelectTE,
4159 bool matchesSelectOfBits(
const TreeEntry &SelectTE)
const;
/// Constructs an entry bound to \p Container. Only the Container reference
/// member is initialized in this constructor's initializer list.
4164 TreeEntry(VecTreeTy &Container) : Container(Container) {}
4167 SmallVector<int> getCommonMask()
const {
4168 if (State == TreeEntry::SplitVectorize)
4170 SmallVector<int>
Mask;
4177 SmallVector<int> getSplitMask()
const {
4178 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
4179 "Expected only split vectorize node.");
4180 unsigned CommonVF = std::max<unsigned>(
4181 CombinedEntriesWithIndices.back().second,
4182 Scalars.size() - CombinedEntriesWithIndices.back().second);
4183 const unsigned Scale =
getNumElements(Scalars.front()->getType());
4186 for (
auto [Idx,
I] :
enumerate(ReorderIndices)) {
4190 (Idx >= CombinedEntriesWithIndices.back().second
4191 ? CommonVF - CombinedEntriesWithIndices.back().second * Scale
4200 void reorderSplitNode(
unsigned Idx, ArrayRef<int> Mask,
4201 ArrayRef<int> MaskOrder);
4206 if (
Mask.size() != VL.
size() && VL.
size() == Scalars.size())
4207 return std::equal(VL.
begin(), VL.
end(), Scalars.begin());
4210 [Scalars](
Value *V,
int Idx) {
4211 return (isa<UndefValue>(V) &&
4212 Idx == PoisonMaskElem) ||
4213 (Idx != PoisonMaskElem && V == Scalars[Idx]);
4216 if (!ReorderIndices.empty()) {
4220 SmallVector<int>
Mask;
4222 if (VL.
size() == Scalars.size())
4223 return IsSame(Scalars, Mask);
4224 if (VL.
size() == ReuseShuffleIndices.size()) {
4226 return IsSame(Scalars, Mask);
4230 return IsSame(Scalars, ReuseShuffleIndices);
4234 bool hasEqualOperands(
const TreeEntry &TE)
const {
4235 if (
TE.getNumOperands() != getNumOperands())
4237 SmallBitVector
Used(getNumOperands());
4238 for (
unsigned I = 0,
E = getNumOperands();
I <
E; ++
I) {
4239 unsigned PrevCount =
Used.count();
4240 for (
unsigned K = 0;
K <
E; ++
K) {
4243 if (getOperand(K) ==
TE.getOperand(
I)) {
4249 if (PrevCount ==
Used.count())
4258 unsigned getVectorFactor()
const {
4259 if (!ReuseShuffleIndices.empty())
4260 return ReuseShuffleIndices.size();
4261 return Scalars.size();
/// \returns true when this entry's State is NeedToGather.
4265 bool isGather()
const {
return State == NeedToGather; }
4271 WeakTrackingVH VectorizedValue =
nullptr;
4292 enum CombinedOpcode {
4294 MinMax = Instruction::OtherOpsEnd + 1,
4297 ReducedBitcastBSwap,
4298 ReducedBitcastLoads,
4299 ReducedBitcastBSwapLoads,
4302 CombinedOpcode CombinedOp = NotCombinedOp;
4305 SmallVector<int, 4> ReuseShuffleIndices;
4308 SmallVector<unsigned, 4> ReorderIndices;
4316 VecTreeTy &Container;
4319 EdgeInfo UserTreeIndex;
4332 SmallVector<ValueList, 2> Operands;
4335 SmallPtrSet<const Value *, 4> CopyableElements;
4339 InstructionsState S = InstructionsState::invalid();
4342 unsigned InterleaveFactor = 0;
4345 bool DoesNotNeedToSchedule =
false;
4349 if (Operands.size() <
OpIdx + 1)
4350 Operands.resize(
OpIdx + 1);
4353 "Number of operands is greater than the number of scalars.");
4359 mutable SmallDenseMap<Value *, unsigned> ValueToLane;
/// \returns the InterleaveFactor currently stored for this entry.
4363 unsigned getInterleaveFactor()
const {
return InterleaveFactor; }
/// Stores \p Factor as this entry's InterleaveFactor.
4365 void setInterleave(
unsigned Factor) { InterleaveFactor = Factor; }
/// Sets the DoesNotNeedToSchedule flag to true.
4368 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule =
true; }
/// \returns the current value of the DoesNotNeedToSchedule flag.
4371 bool doesNotNeedToSchedule()
const {
return DoesNotNeedToSchedule; }
4376 setOperand(
I, Operands[
I]);
4380 void reorderOperands(ArrayRef<int> Mask) {
4388 return Operands[
OpIdx];
4394 return Operands[
OpIdx];
/// \returns the number of operand lists currently held in Operands.
4398 unsigned getNumOperands()
const {
return Operands.size(); }
4401 Value *getSingleOperand(
unsigned OpIdx)
const {
4404 return Operands[
OpIdx][0];
/// Forwards to isAltShuffle() on the stored InstructionsState S.
4408 bool isAltShuffle()
const {
return S.isAltShuffle(); }
4410 Instruction *getMatchingMainOpOrAltOp(Instruction *
I)
const {
4411 return S.getMatchingMainOpOrAltOp(
I);
4419 if (
I && getMatchingMainOpOrAltOp(
I))
4421 return S.getMainOp();
4424 void setOperations(
const InstructionsState &S) {
4425 assert(S &&
"InstructionsState is invalid.");
/// \returns the main instruction reported by the stored InstructionsState S.
4429 Instruction *getMainOp()
const {
return S.getMainOp(); }
/// \returns the alternate instruction reported by the stored
/// InstructionsState S.
4431 Instruction *getAltOp()
const {
return S.getAltOp(); }
/// \returns the opcode reported by the stored InstructionsState S.
4434 unsigned getOpcode()
const {
return S.getOpcode(); }
/// \returns the alternate opcode reported by the stored InstructionsState S.
4436 unsigned getAltOpcode()
const {
return S.getAltOpcode(); }
/// \returns true when the stored InstructionsState S reports itself valid.
4438 bool hasState()
const {
return S.valid(); }
4441 void addCopyableElement(
Value *V) {
4442 assert(S.isCopyableElement(V) &&
"Not a copyable element.");
4443 CopyableElements.insert(V);
4447 bool isCopyableElement(
Value *V)
const {
4448 return CopyableElements.contains(V);
4453 bool isExpandedBinOp(
Value *V)
const {
4454 assert(hasState() &&
"InstructionsState is invalid.");
4455 if (isCopyableElement(V))
4457 return S.isExpandedBinOp(V);
4462 bool isExpandedOperand(Instruction *
I,
unsigned Idx)
const {
4463 assert(hasState() &&
"InstructionsState is invalid.");
4464 if (isCopyableElement(
I))
4466 if (!isExpandedBinOp(
I))
4468 return S.isExpandedOperand(
I, Idx);
/// \returns true when at least one value is recorded in CopyableElements.
4472 bool hasCopyableElements()
const {
return !CopyableElements.empty(); }
/// \returns a const reference to the stored InstructionsState S.
4475 const InstructionsState &getOperations()
const {
return S; }
4479 unsigned findLaneForValue(
Value *V)
const {
4480 auto Res = ValueToLane.try_emplace(V, getVectorFactor());
4482 return Res.first->second;
4483 unsigned &FoundLane = Res.first->getSecond();
4484 for (
auto *It =
find(Scalars, V), *End = Scalars.end(); It != End;
4485 std::advance(It, 1)) {
4488 FoundLane = std::distance(Scalars.begin(), It);
4489 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
4490 if (!ReorderIndices.empty())
4491 FoundLane = ReorderIndices[FoundLane];
4492 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
4493 if (ReuseShuffleIndices.empty())
4495 if (
auto *RIt =
find(ReuseShuffleIndices, FoundLane);
4496 RIt != ReuseShuffleIndices.end()) {
4497 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
4501 assert(FoundLane < getVectorFactor() &&
"Unable to find given value.");
4508 buildAltOpShuffleMask(
const function_ref<
bool(Instruction *)> IsAltOp,
4509 SmallVectorImpl<int> &Mask,
4510 SmallVectorImpl<Value *> *OpScalars =
nullptr,
4511 SmallVectorImpl<Value *> *AltScalars =
nullptr)
const;
4514 bool isNonPowOf2Vec()
const {
4516 return IsNonPowerOf2;
4519 Value *getOrdered(
unsigned Idx)
const {
4520 if (ReorderIndices.empty())
4521 return Scalars[Idx];
4522 SmallVector<int>
Mask;
4524 return Scalars[
Mask[Idx]];
4530 dbgs() << Idx <<
".\n";
4531 for (
unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4532 dbgs() <<
"Operand " << OpI <<
":\n";
4533 for (
const Value *V : Operands[OpI])
4536 dbgs() <<
"Scalars: \n";
4537 for (
Value *V : Scalars) {
4539 << ((S && S.isExpandedBinOp(V)) ?
" [[Expanded]]\n"
4542 dbgs() <<
"State: ";
4543 if (S && hasCopyableElements())
4544 dbgs() <<
"[[Copyable]] ";
4547 if (InterleaveFactor > 0) {
4548 dbgs() <<
"Vectorize with interleave factor " << InterleaveFactor
4551 dbgs() <<
"Vectorize\n";
4554 case ScatterVectorize:
4555 dbgs() <<
"ScatterVectorize\n";
4557 case StridedVectorize:
4558 dbgs() <<
"StridedVectorize\n";
4560 case CompressVectorize:
4561 dbgs() <<
"CompressVectorize\n";
4564 dbgs() <<
"NeedToGather\n";
4566 case CombinedVectorize:
4567 dbgs() <<
"CombinedVectorize\n";
4569 case SplitVectorize:
4570 dbgs() <<
"SplitVectorize\n";
4574 dbgs() <<
"MainOp: " << *S.getMainOp() <<
"\n";
4575 dbgs() <<
"AltOp: " << *S.getAltOp() <<
"\n";
4577 dbgs() <<
"MainOp: NULL\n";
4578 dbgs() <<
"AltOp: NULL\n";
4580 dbgs() <<
"VectorizedValue: ";
4581 if (VectorizedValue)
4582 dbgs() << *VectorizedValue <<
"\n";
4585 dbgs() <<
"ReuseShuffleIndices: ";
4586 if (ReuseShuffleIndices.empty())
4589 for (
int ReuseIdx : ReuseShuffleIndices)
4590 dbgs() << ReuseIdx <<
", ";
4592 dbgs() <<
"ReorderIndices: ";
4593 for (
unsigned ReorderIdx : ReorderIndices)
4594 dbgs() << ReorderIdx <<
", ";
4596 dbgs() <<
"UserTreeIndex: ";
4598 dbgs() << UserTreeIndex;
4600 dbgs() <<
"<invalid>";
4602 if (!CombinedEntriesWithIndices.empty()) {
4603 dbgs() <<
"Combined entries: ";
4605 dbgs() <<
"Entry index " <<
P.first <<
" with offset " <<
P.second;
4616 StringRef Banner)
const {
4617 dbgs() <<
"SLP: " << Banner <<
":\n";
4619 dbgs() <<
"SLP: Costs:\n";
4620 dbgs() <<
"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<
"\n";
4621 dbgs() <<
"SLP: VectorCost = " << VecCost <<
"\n";
4622 dbgs() <<
"SLP: ScalarCost = " << ScalarCost <<
"\n";
4623 dbgs() <<
"SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4624 << ReuseShuffleCost + VecCost - ScalarCost <<
"\n";
4630 const InstructionsState &S,
4632 ArrayRef<int> ReuseShuffleIndices = {}) {
4633 auto Invalid = ScheduleBundle::invalid();
4634 return newTreeEntry(VL,
Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4639 const InstructionsState &S,
4641 ArrayRef<int> ReuseShuffleIndices = {},
4642 ArrayRef<unsigned> ReorderIndices = {},
4643 unsigned InterleaveFactor = 0) {
4644 TreeEntry::EntryState EntryState =
4645 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4646 TreeEntry *
E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4647 ReuseShuffleIndices, ReorderIndices);
4648 if (
E && InterleaveFactor > 0)
4649 E->setInterleave(InterleaveFactor);
4654 TreeEntry::EntryState EntryState,
4655 ScheduleBundle &Bundle,
const InstructionsState &S,
4657 ArrayRef<int> ReuseShuffleIndices = {},
4658 ArrayRef<unsigned> ReorderIndices = {}) {
4659 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4660 EntryState == TreeEntry::SplitVectorize)) ||
4661 (Bundle && EntryState != TreeEntry::NeedToGather &&
4662 EntryState != TreeEntry::SplitVectorize)) &&
4663 "Need to vectorize gather entry?");
4665 if (GatheredLoadsEntriesFirst.has_value() &&
4666 EntryState == TreeEntry::NeedToGather && S &&
4667 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4668 !UserTreeIdx.UserTE)
4670 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
4671 TreeEntry *
Last = VectorizableTree.back().get();
4672 Last->Idx = VectorizableTree.size() - 1;
4673 Last->State = EntryState;
4674 if (UserTreeIdx.UserTE)
4675 OperandsToTreeEntry.try_emplace(
4676 std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx),
Last);
4677 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
4678 ReuseShuffleIndices.end());
4679 if (ReorderIndices.
empty()) {
4682 Last->setOperations(S);
4685 Last->Scalars.assign(VL.
size(),
nullptr);
4687 [VL](
unsigned Idx) ->
Value * {
4688 if (Idx >= VL.size())
4689 return UndefValue::get(VL.front()->getType());
4694 Last->setOperations(S);
4695 Last->ReorderIndices.append(ReorderIndices.
begin(), ReorderIndices.
end());
4697 if (EntryState == TreeEntry::SplitVectorize) {
4698 assert(S &&
"Split nodes must have operations.");
4699 Last->setOperations(S);
4700 SmallPtrSet<Value *, 4> Processed;
4701 for (
Value *V : VL) {
4705 auto It = ScalarsInSplitNodes.find(V);
4706 if (It == ScalarsInSplitNodes.end()) {
4707 ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(
Last);
4708 (void)Processed.
insert(V);
4709 }
else if (Processed.
insert(V).second) {
4711 "Value already associated with the node.");
4712 It->getSecond().push_back(
Last);
4715 }
else if (!
Last->isGather()) {
4718 (!S.areInstructionsWithCopyableElements() &&
4720 all_of(VL, [&](
Value *V) {
return S.isNonSchedulable(V); }))
4721 Last->setDoesNotNeedToSchedule();
4722 SmallPtrSet<Value *, 4> Processed;
4723 for (
Value *V : VL) {
4726 if (S.isCopyableElement(V)) {
4727 Last->addCopyableElement(V);
4730 auto It = ScalarToTreeEntries.find(V);
4731 if (It == ScalarToTreeEntries.end()) {
4732 ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(
Last);
4733 (void)Processed.
insert(V);
4734 }
else if (Processed.
insert(V).second) {
4736 "Value already associated with the node.");
4737 It->getSecond().push_back(
Last);
4741 assert((!Bundle.getBundle().empty() ||
Last->doesNotNeedToSchedule()) &&
4742 "Bundle and VL out of sync");
4743 if (!Bundle.getBundle().empty()) {
4744#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4745 auto *BundleMember = Bundle.getBundle().begin();
4746 SmallPtrSet<Value *, 4> Processed;
4747 for (
Value *V : VL) {
4748 if (S.isNonSchedulable(V) || !Processed.
insert(V).second)
4752 assert(BundleMember == Bundle.getBundle().end() &&
4753 "Bundle and VL out of sync");
4755 Bundle.setTreeEntry(
Last);
4759 bool AllConstsOrCasts =
true;
4760 for (
Value *V : VL) {
4761 if (S && S.areInstructionsWithCopyableElements() &&
4762 S.isCopyableElement(V))
4763 Last->addCopyableElement(V);
4766 AllConstsOrCasts &=
I &&
I->getType()->isIntegerTy();
4767 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4768 !UserTreeIdx.UserTE->isGather())
4769 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(
Last);
4772 if (AllConstsOrCasts)
4774 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
4775 MustGather.insert_range(VL);
4778 if (UserTreeIdx.UserTE)
4779 Last->UserTreeIndex = UserTreeIdx;
4785 TreeEntry::VecTreeTy VectorizableTree;
4790 for (
unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4791 VectorizableTree[
Id]->dump();
4792 if (TransformedToGatherNodes.contains(VectorizableTree[Id].get()))
4793 dbgs() <<
"[[TRANSFORMED TO GATHER]]";
4794 else if (DeletedNodes.contains(VectorizableTree[Id].get()))
4795 dbgs() <<
"[[DELETED NODE]]";
4803 assert(V &&
"V cannot be nullptr.");
4804 auto It = ScalarToTreeEntries.find(V);
4805 if (It == ScalarToTreeEntries.end())
4807 return It->getSecond();
4812 assert(V &&
"V cannot be nullptr.");
4813 auto It = ScalarsInSplitNodes.find(V);
4814 if (It == ScalarsInSplitNodes.end())
4816 return It->getSecond();
4821 bool SameVF =
false)
const {
4822 assert(V &&
"V cannot be nullptr.");
4823 for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4824 if ((!SameVF ||
TE->getVectorFactor() == VL.
size()) &&
TE->isSame(VL))
4831 class ScalarsVectorizationLegality {
4832 InstructionsState S;
4834 bool TryToFindDuplicates;
4835 bool TrySplitVectorize;
4838 ScalarsVectorizationLegality(InstructionsState S,
bool IsLegal,
4839 bool TryToFindDuplicates =
true,
4840 bool TrySplitVectorize =
false)
4841 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4842 TrySplitVectorize(TrySplitVectorize) {
4843 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4844 "Inconsistent state");
/// \returns a const reference to the InstructionsState S held by this object.
4846 const InstructionsState &getInstructionsState()
const {
return S; };
/// \returns the IsLegal flag.
4847 bool isLegal()
const {
return IsLegal; }
/// \returns the TryToFindDuplicates flag.
4848 bool tryToFindDuplicates()
const {
return TryToFindDuplicates; }
/// \returns the TrySplitVectorize flag.
4849 bool trySplitVectorize()
const {
return TrySplitVectorize; }
4854 ScalarsVectorizationLegality
4856 const EdgeInfo &UserTreeIdx)
const;
4860 TreeEntry::EntryState getScalarsVectorizationState(
4862 bool IsScatterVectorizeUserTE,
OrdersType &CurrentOrder,
4866 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4869 SmallPtrSet<const TreeEntry *, 8> DeletedNodes;
4873 SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;
4876 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4877 OperandsToTreeEntry;
4880 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4883 SmallDenseMap<Value *, unsigned> InstrElementSize;
4897 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4901 SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;
4906 SetVector<const TreeEntry *> PostponedGathers;
4908 using ValueToGatherNodesMap =
4909 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4910 ValueToGatherNodesMap ValueToGatherNodes;
4912 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
4917 SetVector<unsigned> LoadEntriesToVectorize;
4920 bool IsGraphTransformMode =
false;
4923 std::optional<unsigned> GatheredLoadsEntriesFirst;
4926 SmallDenseMap<
const TreeEntry *,
4927 std::tuple<SmallVector<int>,
VectorType *, unsigned,
bool>>
4928 CompressEntryToData;
4932 SmallVector<const Loop *> CurrentLoopNest;
4936 SmallVector<const SCEV *> MergedLoopBTCs;
4939 SmallDenseMap<const Loop *, SmallVector<const Loop *>> LoopToLoopNest;
4944 SmallDenseMap<const Loop *, uint64_t> LoopNestScaleCache;
4947 struct ExternalUser {
/// Initializes the Scalar, User, E and Lane members from \p S, \p U, \p E and
/// \p L respectively.
4948 ExternalUser(
Value *S, llvm::User *U,
const TreeEntry &E,
unsigned L)
4949 : Scalar(S), User(
U), E(E), Lane(
L) {}
4952 Value *Scalar =
nullptr;
4955 llvm::User *User =
nullptr;
4963 using UserList = SmallVector<ExternalUser, 16>;
4969 bool isAliased(
const MemoryLocation &Loc1, Instruction *Inst1,
4970 Instruction *Inst2) {
4973 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
4974 auto Res = AliasCache.try_emplace(
Key);
4976 return Res.first->second;
4977 bool Aliased =
isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
4979 Res.first->getSecond() = Aliased;
4983 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4987 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4992 BatchAAResults BatchAA;
4999 DenseSet<Instruction *> DeletedInstructions;
5002 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
5005 DenseSet<size_t> AnalyzedReductionVals;
5009 DenseSet<Value *> AnalyzedMinBWVals;
5015 UserList ExternalUses;
5019 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
5023 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
5026 SmallPtrSet<const Value *, 32> EphValues;
5030 SetVector<Instruction *> GatherShuffleExtractSeq;
5033 DenseSet<BasicBlock *> CSEBlocks;
5036 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
5043 class ScheduleEntity {
5044 friend class ScheduleBundle;
5045 friend class ScheduleData;
5046 friend class ScheduleCopyableData;
/// Tag identifying the kind of a schedule entity; ScheduleData::classof
/// compares getKind() against Kind::ScheduleData.
5049 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
/// \returns the kind tag K this entity was constructed with.
5050 Kind getKind()
const {
return K; }
/// Constructs an entity tagged with kind \p K.
5051 ScheduleEntity(Kind K) : K(K) {}
5055 int SchedulingPriority = 0;
5058 bool IsScheduled =
false;
5060 const Kind K = Kind::ScheduleData;
/// Default construction is disabled.
5063 ScheduleEntity() =
delete;
/// Stores \p Priority as this entity's SchedulingPriority.
5065 void setSchedulingPriority(
int Priority) { SchedulingPriority = Priority; }
/// \returns the stored SchedulingPriority.
5066 int getSchedulingPriority()
const {
return SchedulingPriority; }
5067 bool isReady()
const {
5069 return SD->isReady();
5071 return CD->isReady();
5077 bool hasValidDependencies()
const {
5079 return SD->hasValidDependencies();
5081 return CD->hasValidDependencies();
5085 int getUnscheduledDeps()
const {
5087 return SD->getUnscheduledDeps();
5089 return CD->getUnscheduledDeps();
5093 int incrementUnscheduledDeps(
int Incr) {
5095 return SD->incrementUnscheduledDeps(Incr);
5099 int getDependencies()
const {
5101 return SD->getDependencies();
5107 return SD->getInst();
5112 bool isScheduled()
const {
return IsScheduled; }
5113 void setScheduled(
bool Scheduled) { IsScheduled = Scheduled; }
5115 static bool classof(
const ScheduleEntity *) {
return true; }
5117#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5118 void dump(raw_ostream &OS)
const {
5120 return SD->dump(OS);
5122 return CD->dump(OS);
5133#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5135 const BoUpSLP::ScheduleEntity &SE) {
// Per-instruction scheduling record. Tracks a total dependency count
// (Dependencies), the number of those not yet scheduled (UnscheduledDeps),
// lists of memory and control dependencies, a link to another load/store
// record, and the ID of the scheduling region it was initialised for.
5145 class ScheduleData final :
public ScheduleEntity {
// Sentinel stored in Dependencies / UnscheduledDeps while no dependency
// information has been computed.
5149 enum { InvalidDeps = -1 };
5151 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
// Kind check used for LLVM-style casting.
5152 static bool classof(
const ScheduleEntity *Entity) {
5153 return Entity->getKind() == Kind::ScheduleData;
// (Re)initialises the record for region BlockSchedulingRegionID: clears the
// load/store link, the scheduled flag and all dependency information.
// NOTE(review): the statement storing I is not visible in this excerpt.
5156 void init(
int BlockSchedulingRegionID, Instruction *
I) {
5157 NextLoadStore =
nullptr;
5158 IsScheduled =
false;
5159 SchedulingRegionID = BlockSchedulingRegionID;
5160 clearDependencies();
// Consistency assertions; the enclosing function's signature and the
// branches selecting between these asserts are not part of this excerpt.
5166 if (hasValidDependencies()) {
5167 assert(UnscheduledDeps <= Dependencies &&
"invariant");
5169 assert(UnscheduledDeps == Dependencies &&
"invariant");
5173 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5174 "unexpected scheduled state");
// True once Dependencies no longer holds the InvalidDeps sentinel.
5181 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
// Ready means: nothing left to wait for and not scheduled yet.
5185 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
// Adds Incr (may be negative) to UnscheduledDeps and returns the new value.
// Asserts that dependencies are valid and the counter stays non-negative.
5190 int incrementUnscheduledDeps(
int Incr) {
5191 assert(hasValidDependencies() &&
5192 "increment of unscheduled deps would be meaningless");
5193 UnscheduledDeps += Incr;
5194 assert(UnscheduledDeps >= 0 &&
5195 "Expected valid number of unscheduled deps");
5196 return UnscheduledDeps;
// Restores UnscheduledDeps to the full dependency count.
5201 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
// Invalidates the counters and empties both dependency lists.
5204 void clearDependencies() {
5205 clearDirectDependencies();
5206 MemoryDependencies.clear();
5207 ControlDependencies.clear();
// Invalidates the counters and the scheduled flag but leaves the memory and
// control dependency lists untouched.
5214 void clearDirectDependencies() {
5215 Dependencies = InvalidDeps;
5216 resetUnscheduledDeps();
5217 IsScheduled =
false;
5221 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
5223 int getDependencies()
const {
return Dependencies; }
// Switches Dependencies from the InvalidDeps sentinel to a valid zero count.
5225 void initDependencies() { Dependencies = 0; }
5227 void incDependencies() { Dependencies++; }
5230 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
// Tail of the accessor returning the memory dependency list (signature line
// not part of this excerpt).
5237 return MemoryDependencies;
5240 void addMemoryDependency(ScheduleData *Dep) {
5241 MemoryDependencies.push_back(Dep);
// Tail of the accessor returning the control dependency list (signature line
// not part of this excerpt).
5245 return ControlDependencies;
5248 void addControlDependency(ScheduleData *Dep) {
5249 ControlDependencies.push_back(Dep);
5252 ScheduleData *getNextLoadStore()
const {
return NextLoadStore; }
5253 void setNextLoadStore(ScheduleData *
Next) { NextLoadStore =
Next; }
// Prints the associated instruction.
5255 void dump(raw_ostream &OS)
const { OS << *Inst; }
// Link to another load/store record; maintained via get/setNextLoadStore().
5267 ScheduleData *NextLoadStore =
nullptr;
// Records added through addMemoryDependency().
5271 SmallVector<ScheduleData *> MemoryDependencies;
// Records added through addControlDependency().
5277 SmallVector<ScheduleData *> ControlDependencies;
// Region ID assigned by init(); compared against the current region ID by
// BlockScheduling::isInSchedulingRegion().
5281 int SchedulingRegionID = 0;
// Total dependency count, or InvalidDeps while not computed.
5287 int Dependencies = InvalidDeps;
// Dependencies not yet scheduled, or InvalidDeps while not computed.
5293 int UnscheduledDeps = InvalidDeps;
5298 const BoUpSLP::ScheduleData &SD) {
// A group of ScheduleEntity members (held in the container named Bundle,
// whose declaration is not part of this excerpt) together with an optional
// TreeEntry. A bundle can also be constructed in an "invalid" state, which
// is what its bool conversion reports.
5304 class ScheduleBundle final :
public ScheduleEntity {
// Validity flag returned by operator bool().
5308 bool IsValid =
true;
// Tree entry associated through setTreeEntry(); null until then.
5310 TreeEntry *TE =
nullptr;
// Constructor taking an explicit validity flag; used by invalid().
5311 ScheduleBundle(
bool IsValid)
5312 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
5315 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
// Kind check used for LLVM-style casting.
5316 static bool classof(
const ScheduleEntity *Entity) {
5317 return Entity->getKind() == Kind::ScheduleBundle;
// Consistency assertions over every bundle member; the enclosing function's
// signature and parts of the assertions are not part of this excerpt.
5322 for (
const ScheduleEntity *SD : Bundle) {
5323 if (SD->hasValidDependencies()) {
5324 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5327 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5331 if (isScheduled()) {
5332 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5333 "unexpected scheduled state");
// Accumulates the members' unscheduled-dependency counts into Sum, bailing
// out with InvalidDeps as soon as one member reports InvalidDeps.
// NOTE(review): the declaration of Sum and the final return are not visible.
5339 int unscheduledDepsInBundle()
const {
5340 assert(*
this &&
"bundle must not be empty");
5342 for (
const ScheduleEntity *BundleMember : Bundle) {
5343 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5344 return ScheduleData::InvalidDeps;
5345 Sum += BundleMember->getUnscheduledDeps();
// True only if every member has valid dependencies.
5353 bool hasValidDependencies()
const {
5354 return all_of(Bundle, [](
const ScheduleEntity *SD) {
5355 return SD->hasValidDependencies();
// Ready means: no unscheduled dependencies in any member and the bundle
// itself is not scheduled yet.
5361 bool isReady()
const {
5362 assert(*
this &&
"bundle must not be empty");
5363 return unscheduledDepsInBundle() == 0 && !isScheduled();
// Appends SD to the member list.
5371 void add(ScheduleEntity *SD) { Bundle.push_back(SD); }
5374 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5375 TreeEntry *getTreeEntry()
const {
return TE; }
// Factory for a bundle whose IsValid flag is false.
5377 static ScheduleBundle invalid() {
return {
false}; }
// Converts to the validity flag.
5379 operator bool()
const {
return IsValid; }
// Debug printing; only the statement printing a member's instruction is
// visible in this excerpt.
5382 void dump(raw_ostream &OS)
const {
5391 OS << *SD->getInst();
5405 const BoUpSLP::ScheduleBundle &Bundle) {
// Scheduling record for a "copyable" element. It is identified by an
// instruction and an EdgeInfo, belongs to a ScheduleBundle, and carries its
// own dependency counters that use ScheduleData::InvalidDeps as the
// "not computed" sentinel. The declarations of the Inst and EI members are
// not part of this excerpt.
5416 class ScheduleCopyableData final :
public ScheduleEntity {
// Region ID supplied at construction; compared against the current region ID
// by BlockScheduling::isInSchedulingRegion().
5423 int SchedulingRegionID = 0;
// Bundle this record belongs to; returned by getBundle().
5425 ScheduleBundle &Bundle;
// Stores the region ID, the instruction, the edge and the owning bundle.
5428 ScheduleCopyableData(
int BlockSchedulingRegionID,
Instruction *
I,
5429 const EdgeInfo &EI, ScheduleBundle &Bundle)
5430 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(
I), EI(EI),
5431 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
// Kind check used for LLVM-style casting.
5432 static bool classof(
const ScheduleEntity *Entity) {
5433 return Entity->getKind() == Kind::ScheduleCopyableData;
// Consistency assertions; the enclosing function's signature and the
// branches selecting between these asserts are not part of this excerpt.
5438 if (hasValidDependencies()) {
5439 assert(UnscheduledDeps <= Dependencies &&
"invariant");
5441 assert(UnscheduledDeps == Dependencies &&
"invariant");
5445 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5446 "unexpected scheduled state");
// True once Dependencies no longer holds the InvalidDeps sentinel.
5453 bool hasValidDependencies()
const {
5454 return Dependencies != ScheduleData::InvalidDeps;
// Ready means: nothing left to wait for and not scheduled yet.
5459 bool isReady()
const {
return UnscheduledDeps == 0 && !IsScheduled; }
// Adds Incr (may be negative) to UnscheduledDeps and returns the new value.
5464 int incrementUnscheduledDeps(
int Incr) {
5465 assert(hasValidDependencies() &&
5466 "increment of unscheduled deps would be meaningless");
5467 UnscheduledDeps += Incr;
5468 assert(UnscheduledDeps >= 0 &&
"invariant");
5469 return UnscheduledDeps;
// Restores UnscheduledDeps to the full dependency count.
5474 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5477 int getUnscheduledDeps()
const {
return UnscheduledDeps; }
5479 int getDependencies()
const {
return Dependencies; }
// Switches Dependencies from the InvalidDeps sentinel to a valid zero count.
5481 void initDependencies() { Dependencies = 0; }
5483 void incDependencies() { Dependencies++; }
5486 int getSchedulingRegionID()
const {
return SchedulingRegionID; }
// Invalidates both counters and clears the scheduled flag.
5492 void clearDependencies() {
5493 Dependencies = ScheduleData::InvalidDeps;
5494 UnscheduledDeps = ScheduleData::InvalidDeps;
5495 IsScheduled =
false;
5499 const EdgeInfo &getEdgeInfo()
const {
return EI; }
5502 ScheduleBundle &getBundle() {
return Bundle; }
5503 const ScheduleBundle &getBundle()
const {
return Bundle; }
// Debug printing, compiled only in assert-enabled or dump-enabled builds.
5505#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5506 void dump(raw_ostream &OS)
const { OS <<
"[Copyable]" << *getInst(); }
// Total dependency count, or InvalidDeps while not computed.
5517 int Dependencies = ScheduleData::InvalidDeps;
// Dependencies not yet scheduled, or InvalidDeps while not computed.
5523 int UnscheduledDeps = ScheduleData::InvalidDeps;
5553 struct BlockScheduling {
5555 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
// Body of the routine that resets the per-region scheduling state (its
// signature line is not part of this excerpt).
// Drop all bundle and copyable-data bookkeeping.
5558 ScheduledBundles.clear();
5559 ScheduledBundlesList.
clear();
5560 ScheduleCopyableDataMap.clear();
5561 ScheduleCopyableDataMapByInst.clear();
5562 ScheduleCopyableDataMapByInstUser.clear();
5563 ScheduleCopyableDataMapByUsers.clear();
// Forget the region boundaries and the first/last load-store records.
5565 ScheduleStart =
nullptr;
5566 ScheduleEnd =
nullptr;
5567 FirstLoadStoreInRegion =
nullptr;
5568 LastLoadStoreInRegion =
nullptr;
5569 RegionHasStackSave =
false;
// Charge the size of the region being discarded against the size limit.
// NOTE(review): any condition guarding this statement is not visible here.
5573 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5576 ScheduleRegionSize = 0;
// Bumping the ID makes isInSchedulingRegion() reject every record that still
// carries the previous region ID.
5580 ++SchedulingRegionID;
// Looks up the ScheduleData record of instruction I. Instructions whose
// parent differs from BB are handled separately, and a record found in
// ScheduleDataMap is only accepted if it belongs to the current scheduling
// region. The return statements themselves are not part of this excerpt.
5583 ScheduleData *getScheduleData(Instruction *
I) {
5586 if (BB !=
I->getParent())
// lookup() yields a null pointer when I has no entry in the map.
5589 ScheduleData *SD = ScheduleDataMap.lookup(
I);
5590 if (SD && isInSchedulingRegion(*SD))
5595 ScheduleData *getScheduleData(
Value *V) {
// Looks up the copyable-data record stored for the (EI, V) pair in
// ScheduleCopyableDataMap. The map being empty, the key being absent, and
// the record not belonging to the current scheduling region are each tested
// separately; the corresponding return statements are not part of this
// excerpt.
5601 ScheduleCopyableData *getScheduleCopyableData(
const EdgeInfo &EI,
5602 const Value *V)
const {
// Cheap early test before building the key.
5603 if (ScheduleCopyableDataMap.empty())
5605 auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V));
5606 if (It == ScheduleCopyableDataMap.end())
// The map owns the record through a unique_ptr; take the raw pointer.
5608 ScheduleCopyableData *SD = It->getSecond().get();
5609 if (!isInSchedulingRegion(*SD))
// Overload keyed by ((User, OperandIdx), V): consults
// ScheduleCopyableDataMapByInstUser and visits the records stored for that
// key, testing each for membership in the current scheduling region.
// The return type, the last parameter and the statements executed for
// matching records are not part of this excerpt.
5617 getScheduleCopyableData(
const Value *User,
unsigned OperandIdx,
// Cheap early test before building the key.
5619 if (ScheduleCopyableDataMapByInstUser.empty())
5621 const auto It = ScheduleCopyableDataMapByInstUser.find(
5622 std::make_pair(std::make_pair(User, OperandIdx), V));
5623 if (It == ScheduleCopyableDataMapByInstUser.end())
5626 for (ScheduleCopyableData *SD : It->getSecond()) {
5627 if (isInSchedulingRegion(*SD))
// Predicate over User, its operands and the tree entries in Entries, built
// around lookups of copyable-data records for (tree entry, operand index)
// edges. Entries for which a direct decision cannot be made are counted in
// PotentiallyReorderedEntriesCount and examined lane by lane in a second
// pass; the final answer requires every such entry's count to equal
// NumOps - 1.
// NOTE(review): the remaining parameters, the definitions of Entries, Op,
// NumOps, Inc and PHI, and most return statements are not part of this
// excerpt, so the precise contract must be confirmed against the full source.
5641 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5645 if (ScheduleCopyableDataMap.empty())
// Per-entry counter for entries that need the lane-based second pass.
5647 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5649 if (Entries.
empty())
5651 unsigned CurNumOps = 0;
// First pass: walk every operand use of User against every tree entry.
5652 for (
const Use &U :
User->operands()) {
5658 for (TreeEntry *TE : Entries) {
// True when TE does not need scheduling and its user tree entry has a state,
// is not SplitVectorize and has the PHI opcode.
5660 bool IsNonSchedulableWithParentPhiNode =
5661 TE->doesNotNeedToSchedule() &&
TE->UserTreeIndex &&
5662 TE->UserTreeIndex.UserTE->hasState() &&
5663 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5664 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5667 if (IsNonSchedulableWithParentPhiNode) {
// Used to process each distinct PHI of the parent entry only once.
5668 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5669 const TreeEntry *ParentTE =
TE->UserTreeIndex.UserTE;
5670 for (
Value *V : ParentTE->Scalars) {
5674 if (ParentsUniqueUsers.
insert(
PHI).second &&
// Inc becomes the number of times User occurs among TE's scalars.
5679 Inc =
count(
TE->Scalars, User);
5687 bool IsCommutativeUser =
5690 if (!IsCommutativeUser) {
5700 (!IsCommutativeUser ||
5709 "Expected commutative user with 2 first commutable operands");
// A commutative user whose first two operands are the same value.
5710 bool IsCommutativeWithSameOps =
5711 IsCommutativeUser &&
User->getOperand(0) ==
User->getOperand(1);
5712 if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
5714 EdgeInfo EI(TE,
U.getOperandNo());
5715 if (CurNumOps !=
NumOps || getScheduleCopyableData(EI,
Op))
// Otherwise remember TE for the second pass, accumulating Inc.
5719 PotentiallyReorderedEntriesCount.
try_emplace(TE, 0)
5720 .first->getSecond() += Inc;
5723 if (PotentiallyReorderedEntriesCount.
empty())
// Second pass: for each remembered entry, visit every occurrence of User in
// its scalars and check the operand at the corresponding lane.
5726 for (
auto &
P : PotentiallyReorderedEntriesCount) {
5727 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5728 bool IsNonSchedulableWithParentPhiNode =
5729 P.first->doesNotNeedToSchedule() &&
P.first->UserTreeIndex &&
5730 P.first->UserTreeIndex.UserTE->hasState() &&
5731 P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5732 P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5733 auto *It =
find(
P.first->Scalars, User);
5735 assert(It !=
P.first->Scalars.end() &&
5736 "User is not in the tree entry");
// Lane is the position of this occurrence within the entry's scalars,
// remapped through ReorderIndices below.
5737 int Lane = std::distance(
P.first->Scalars.begin(), It);
5738 assert(Lane >= 0 &&
"Lane is not found");
5740 Lane =
P.first->ReorderIndices[Lane];
5741 assert(Lane <
static_cast<int>(
P.first->Scalars.size()) &&
5742 "Couldn't find extract lane");
5745 if (IsNonSchedulableWithParentPhiNode) {
5746 const TreeEntry *ParentTE =
P.first->UserTreeIndex.UserTE;
5748 if (!ParentsUniqueUsers.
insert(User).second) {
5754 for (
unsigned OpIdx :
5756 P.first->getMainOp()))) {
// Operand at this lane equals Op and has a copyable-data record on the edge.
5757 if (
P.first->getOperand(
OpIdx)[Lane] ==
Op &&
5758 getScheduleCopyableData(EdgeInfo(
P.first,
OpIdx),
Op))
5763 }
while (It !=
P.first->Scalars.end());
5765 return all_of(PotentiallyReorderedEntriesCount,
5766 [&](
const std::pair<const TreeEntry *, unsigned> &
P) {
5767 return P.second ==
NumOps - 1;
// Overload keyed by instruction: consults ScheduleCopyableDataMapByInst and
// visits the records stored for I, testing each for membership in the
// current scheduling region. The return type and the statements executed for
// matching records are not part of this excerpt.
5772 getScheduleCopyableData(
const Instruction *
I)
const {
// Cheap early test before the lookup.
5773 if (ScheduleCopyableDataMapByInst.empty())
5775 const auto It = ScheduleCopyableDataMapByInst.find(
I);
5776 if (It == ScheduleCopyableDataMapByInst.end())
5779 for (ScheduleCopyableData *SD : It->getSecond()) {
5780 if (isInSchedulingRegion(*SD))
// Consults ScheduleCopyableDataMapByUsers for the entry keyed by User and
// visits the records stored there, testing each for membership in the
// current scheduling region. The return type and the statements executed for
// matching records are not part of this excerpt.
5787 getScheduleCopyableDataUsers(
const Instruction *User)
const {
// Cheap early test before the lookup.
5788 if (ScheduleCopyableDataMapByUsers.empty())
5790 const auto It = ScheduleCopyableDataMapByUsers.find(User);
5791 if (It == ScheduleCopyableDataMapByUsers.end())
5794 for (ScheduleCopyableData *SD : It->getSecond()) {
5795 if (isInSchedulingRegion(*SD))
// Creates the ScheduleCopyableData record for the (EI, I) pair, stores it in
// ScheduleCopyableDataMap (which owns it) and registers it in the secondary
// indices: by instruction, by (user instruction, edge index) and by users.
// Asserts that no record for (EI, I) exists yet.
// NOTE(review): the parameter declaring I, the definitions of Op, It and In,
// and the final return are not part of this excerpt.
5801 ScheduleCopyableData &addScheduleCopyableData(
const EdgeInfo &EI,
5803 int SchedulingRegionID,
5804 ScheduleBundle &Bundle) {
5805 assert(!getScheduleCopyableData(EI,
I) &&
"already in the map");
// Construct the record in place inside the owning map.
5806 ScheduleCopyableData *CD =
5807 ScheduleCopyableDataMap
5808 .try_emplace(std::make_pair(EI,
I),
5809 std::make_unique<ScheduleCopyableData>(
5810 SchedulingRegionID,
I, EI, Bundle))
// Index the new record by its instruction.
5813 ScheduleCopyableDataMapByInst[
I].push_back(CD);
5817 assert(It !=
Op.end() &&
"Lane not set");
// Guards against handling the same user instruction twice.
5818 SmallPtrSet<Instruction *, 4> Visited;
// Lane is the position within Op, remapped through the user entry's
// ReorderIndices when those are non-empty.
5820 int Lane = std::distance(
Op.begin(), It);
5821 assert(Lane >= 0 &&
"Lane not set");
5823 !EI.UserTE->ReorderIndices.empty())
5824 Lane = EI.UserTE->ReorderIndices[Lane];
5825 assert(Lane <
static_cast<int>(EI.UserTE->Scalars.size()) &&
5826 "Couldn't find extract lane");
5828 if (!Visited.
insert(In).second) {
// Index by ((user instruction, edge index), I).
5832 ScheduleCopyableDataMapByInstUser
5833 .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx),
I))
5836 ScheduleCopyableDataMapByUsers.try_emplace(
I)
// If the user itself has a copyable record on its own user edge, remove that
// record from the users list stored for I.
5843 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5844 if (ScheduleCopyableData *UserCD =
5845 getScheduleCopyableData(UserEI, In))
5846 ScheduleCopyableDataMapByUsers[
I].remove(UserCD);
5849 }
while (It !=
Op.end());
5851 ScheduleCopyableDataMapByUsers.try_emplace(
I).first->getSecond().insert(
5861 auto It = ScheduledBundles.find(
I);
5862 if (It == ScheduledBundles.end())
5864 return It->getSecond();
// Reports whether SD belongs to the current scheduling region. For
// ScheduleData and ScheduleCopyableData the stored region ID is compared
// with SchedulingRegionID; the remaining case applies the same test
// recursively to bundle members. The casts selecting between the cases and
// the call the lambda is passed to are not part of this excerpt.
5868 bool isInSchedulingRegion(
const ScheduleEntity &SD)
const {
5870 return Data->getSchedulingRegionID() == SchedulingRegionID;
5872 return CD->getSchedulingRegionID() == SchedulingRegionID;
5874 [&](
const ScheduleEntity *BundleMember) {
5875 return isInSchedulingRegion(*BundleMember);
// Marks the entity Data as scheduled and propagates the consequences: the
// unscheduled-dependency counters of operand, memory and control
// dependencies are decremented, and any entity or bundle that thereby
// reaches zero is inserted into ReadyList.
// NOTE(review): many lines of this function (including several lambda
// headers, branch conditions and closing braces) are not part of this
// excerpt; the comments below describe only the visible statements.
5881 template <
typename ReadyListType>
5882 void schedule(
const BoUpSLP &R,
const InstructionsState &S,
5883 const EdgeInfo &EI, ScheduleEntity *
Data,
5884 ReadyListType &ReadyList) {
// Handles a single member (either standalone or part of the given bundles).
5885 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
// Decrements Data's unscheduled-dependency counter by one (when IsControl is
// set or its dependencies are valid) and, if the counter reaches zero,
// moves the now-ready bundle(s) or the entity itself into ReadyList.
5890 auto DecrUnsched = [&](
auto *
Data,
bool IsControl =
false) {
5891 if ((IsControl ||
Data->hasValidDependencies()) &&
5892 Data->incrementUnscheduledDeps(-1) == 0) {
// For copyable data, the only relevant bundle is the one owning the record.
5899 CopyableBundle.
push_back(&CD->getBundle());
5900 Bundles = CopyableBundle;
// Otherwise use the bundles registered for the instruction.
5902 Bundles = getScheduleBundles(
Data->getInst());
5904 if (!Bundles.
empty()) {
5905 for (ScheduleBundle *Bundle : Bundles) {
// A bundle becomes ready only once all of its members have no pending deps.
5906 if (Bundle->unscheduledDepsInBundle() == 0) {
5907 assert(!Bundle->isScheduled() &&
5908 "already scheduled bundle gets ready");
5909 ReadyList.insert(Bundle);
5911 <<
"SLP: gets ready: " << *Bundle <<
"\n");
5917 "already scheduled bundle gets ready");
5919 "Expected non-copyable data");
5920 ReadyList.insert(
Data);
// Operand handling: copyable-data records registered for
// (User, OpIdx, I) are decremented first; a non-empty result is tested
// before the plain ScheduleData record of I is considered.
5927 if (!ScheduleCopyableDataMap.empty()) {
5929 getScheduleCopyableData(User,
OpIdx,
I);
5930 for (ScheduleCopyableData *CD : CopyableData)
5931 DecrUnsched(CD,
false);
5932 if (!CopyableData.empty())
5935 if (ScheduleData *OpSD = getScheduleData(
I))
5936 DecrUnsched(OpSD,
false);
// Bundle path: operands are taken from the tree entries of the bundles.
5942 if (!Bundles.empty()) {
5943 auto *
In = BundleMember->getInst();
// Number of pending decrements per operand instruction, and their total.
5945 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5946 unsigned TotalOpCount = 0;
5949 TotalOpCount = OperandsUses[
In] = 1;
5951 for (
const Use &U :
In->operands()) {
5954 unsigned ExtraDeps = 1;
5956 for (ScheduleBundle *Bundle : Bundles) {
5957 if (
const TreeEntry *TE = Bundle->getTreeEntry()) {
5958 if (
TE->isExpandedBinOp(In))
5960 }
else if (S.isExpandedBinOp(In)) {
5964 Res.first->getSecond() += ExtraDeps;
5965 TotalOpCount += ExtraDeps;
// Decrements the dependency of operand I used by UserTE at OpIdx. Checked
// holds (entity, operand index) pairs so the same pair is not decremented
// twice.
5971 auto DecrUnschedForInst =
5973 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5975 bool IsExpandedOperand =
false) {
5976 if (!ScheduleCopyableDataMap.empty()) {
5977 const EdgeInfo EI = {UserTE,
OpIdx};
5978 if (ScheduleCopyableData *CD =
5979 getScheduleCopyableData(EI,
I)) {
5980 if (!Checked.insert(std::make_pair(CD,
OpIdx)).second)
5982 DecrUnsched(CD,
false);
5986 auto It = OperandsUses.
find(
I);
5987 assert(It != OperandsUses.
end() &&
"Operand not found");
5988 if (It->second > 0) {
5989 if (ScheduleData *OpSD = getScheduleData(
I)) {
// Expanded operands bypass the Checked de-duplication.
5990 if (!IsExpandedOperand &&
5991 !Checked.insert(std::make_pair(OpSD,
OpIdx)).second)
5994 assert(TotalOpCount > 0 &&
"No more operands to decrement");
5996 DecrUnsched(OpSD,
false);
5999 assert(TotalOpCount > 0 &&
"No more operands to decrement");
6005 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
6006 for (ScheduleBundle *Bundle : Bundles) {
6007 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
6009 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
// Visit every occurrence of In among the tree entry's scalars.
6012 auto *It =
find(Bundle->getTreeEntry()->Scalars, In);
6013 bool IsNonSchedulableWithParentPhiNode =
6014 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
6015 Bundle->getTreeEntry()->UserTreeIndex &&
6016 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
6017 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
6018 TreeEntry::SplitVectorize &&
6019 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
// Lane of this occurrence, remapped through ReorderIndices when non-empty.
6023 std::distance(Bundle->getTreeEntry()->Scalars.begin(), It);
6024 assert(Lane >= 0 &&
"Lane not set");
6026 !Bundle->getTreeEntry()->ReorderIndices.empty())
6027 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
6028 assert(Lane <
static_cast<int>(
6029 Bundle->getTreeEntry()->Scalars.size()) &&
6030 "Couldn't find extract lane");
6041 In->getNumOperands() ==
6042 Bundle->getTreeEntry()->getNumOperands() ||
6043 (
isa<ZExtInst>(In) && Bundle->getTreeEntry()->getOpcode() ==
6044 Instruction::Select) ||
6045 Bundle->getTreeEntry()->isCopyableElement(In)) &&
6046 "Missed TreeEntry operands?");
6050 if (IsNonSchedulableWithParentPhiNode) {
6051 const TreeEntry *ParentTE =
6052 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
// Already-seen users are skipped by advancing to the next occurrence of In.
6054 if (!ParentsUniqueUsers.
insert(User).second) {
6055 It = std::find(std::next(It),
6056 Bundle->getTreeEntry()->Scalars.end(), In);
6061 for (
unsigned OpIdx :
6064 Bundle->getTreeEntry()->getOperand(
OpIdx)[Lane])) {
6068 I, Bundle->getTreeEntry(),
OpIdx, Checked,
6069 Bundle->getTreeEntry()->isExpandedOperand(In,
OpIdx));
6072 if (Bundle->getTreeEntry()->isCopyableElement(In))
// Advance to the next occurrence of In in the scalars list.
6074 It = std::find(std::next(It),
6075 Bundle->getTreeEntry()->Scalars.end(), In);
6076 }
while (It != Bundle->getTreeEntry()->Scalars.end());
// Non-bundle path: walk the member instruction's own operands.
6081 for (Use &U : BundleMember->getInst()->operands()) {
6084 <<
"SLP: check for readiness (def): " << *
I <<
"\n");
6085 DecrUnschedForInst(BundleMember->getInst(),
U.getOperandNo(),
I);
// Memory dependencies: each distinct dependency is decremented once.
6093 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
6094 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
6095 if (!VisitedMemory.
insert(MemoryDep).second)
6100 << *MemoryDep <<
"\n");
6101 DecrUnsched(MemoryDep);
// Control dependencies: each distinct dependency is decremented once, with
// IsControl set so the validity check in DecrUnsched is bypassed.
6104 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
6105 for (ScheduleData *Dep : SD->getControlDependencies()) {
6106 if (!VisitedControl.
insert(Dep).second)
6111 <<
"SLP: check for readiness (ctrl): " << *Dep <<
"\n");
6112 DecrUnsched(Dep,
true);
6116 SD->setScheduled(
true);
// For tree entries whose operand count matches the instruction's, temporary
// bundles are created and attached to the entry before processing.
6122 if (!Entries.
empty()) {
6123 for (TreeEntry *TE : Entries) {
6125 In->getNumOperands() !=
TE->getNumOperands())
6128 PseudoBundles.
emplace_back(std::make_unique<ScheduleBundle>());
6129 BundlePtr->setTreeEntry(TE);
6134 ProcessBundleMember(SD, Bundles);
6137 Bundle.setScheduled(
true);
// True when SDBundles is non-empty and every bundle in it is scheduled.
6139 auto AreAllBundlesScheduled =
6140 [&](
const ScheduleEntity *SD,
6144 return !SDBundles.empty() &&
6145 all_of(SDBundles, [&](
const ScheduleBundle *SDBundle) {
6146 return SDBundle->isScheduled();
// A member is marked scheduled only once all bundles containing it are.
6149 for (ScheduleEntity *SD : Bundle.getBundle()) {
6152 SDBundles = getScheduleBundles(SD->getInst());
6153 if (AreAllBundlesScheduled(SD, SDBundles)) {
6154 SD->setScheduled(
true);
6167 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
6168 ScheduleStart->comesBefore(ScheduleEnd) &&
6169 "Not a valid scheduling region?");
6171 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
6173 if (!Bundles.
empty()) {
6174 for (ScheduleBundle *Bundle : Bundles) {
6175 assert(isInSchedulingRegion(*Bundle) &&
6176 "primary schedule data not in window?");
6181 auto *SD = getScheduleData(
I);
6184 assert(isInSchedulingRegion(*SD) &&
6185 "primary schedule data not in window?");
6190 [](
const ScheduleEntity *Bundle) {
6191 return Bundle->isReady();
6193 "item in ready list not ready?");
// Seeds ReadyList by walking the instructions from ScheduleStart up to (not
// including) ScheduleEnd. For an instruction whose ScheduleData is valid and
// ready, either its ready bundles or the ScheduleData itself are inserted.
// NOTE(review): the definition of Bundles and the branch choosing between
// the bundle path and the single-entity path are not part of this excerpt.
6197 template <
typename ReadyListType>
6198 void initialFillReadyList(ReadyListType &ReadyList) {
// Ensures each bundle is examined at most once.
6199 SmallPtrSet<ScheduleBundle *, 16> Visited;
6200 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
6201 ScheduleData *SD = getScheduleData(
I);
6202 if (SD && SD->hasValidDependencies() && SD->isReady()) {
6205 for (ScheduleBundle *Bundle : Bundles) {
6206 if (!Visited.
insert(Bundle).second)
6208 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
6209 ReadyList.insert(Bundle);
6211 << *Bundle <<
"\n");
6216 ReadyList.insert(SD);
6218 <<
"SLP: initially in ready list: " << *SD <<
"\n");
6229 const InstructionsState &S,
const EdgeInfo &EI);
6236 std::optional<ScheduleBundle *>
6238 const InstructionsState &S,
const EdgeInfo &EI);
6241 ScheduleData *allocateScheduleDataChunks();
6245 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
6249 void initScheduleData(Instruction *FromI, Instruction *ToI,
6250 ScheduleData *PrevLoadStore,
6251 ScheduleData *NextLoadStore);
6255 void calculateDependencies(ScheduleBundle &Bundle,
bool InsertInReadyList,
6257 const SmallPtrSetImpl<Value *> &ExpandedOps,
6261 void resetSchedule();
// Instruction -> ScheduleData record; consulted by getScheduleData().
6278 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
// Owning map of copyable-data records keyed by (edge, value); filled by
// addScheduleCopyableData().
6282 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
6283 std::unique_ptr<ScheduleCopyableData>>
6284 ScheduleCopyableDataMap;
// Non-owning index of the same records keyed by instruction.
6290 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
6291 ScheduleCopyableDataMapByInst;
// Non-owning index keyed by ((user value, operand index), value). The mapped
// type is declared on a line that is not part of this excerpt.
6297 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>,
const Value *>,
6299 ScheduleCopyableDataMapByInstUser;
// Non-owning index keyed by user; the key type is declared on a line that is
// not part of this excerpt.
6319 SmallSetVector<ScheduleCopyableData *, 4>>
6320 ScheduleCopyableDataMapByUsers;
// Instruction -> bundles map; the member name is declared on a line that is
// not part of this excerpt.
6323 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
// NOTE(review): presumably the entities currently ready to be scheduled --
// confirm at the use sites.
6329 SetVector<ScheduleEntity *> ReadyInsts;
// First and last load/store records of the region; reset to null together
// with the region boundaries.
6339 ScheduleData *FirstLoadStoreInRegion =
nullptr;
6343 ScheduleData *LastLoadStoreInRegion =
nullptr;
// Reset to false whenever the region state is cleared.
6348 bool RegionHasStackSave =
false;
// Size of the current region; subtracted from ScheduleRegionSizeLimit and
// zeroed when the region state is cleared.
6351 int ScheduleRegionSize = 0;
// ID of the current region; incremented when the region state is cleared and
// compared against each record's stored ID by isInSchedulingRegion().
6360 int SchedulingRegionID = 1;
// Per-basic-block scheduling state, owned through unique_ptr.
6364 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
// Runs the scheduler for BS; defined outside this excerpt.
6368 void scheduleBlock(
const BoUpSLP &R, BlockScheduling *BS);
// Optional, non-owning set of values; null when none was supplied.
// NOTE(review): presumably values the caller asked to ignore -- confirm.
6371 const SmallDenseSet<Value *> *UserIgnoreList =
nullptr;
// DenseMap traits for OrdersType keys; only the getHashValue() signature is
// visible in this excerpt.
6375 struct OrdersTypeDenseMapInfo {
6388 static unsigned getHashValue(
const OrdersType &V) {
// Non-owning handles to analyses and target information used by the pass.
6399 ScalarEvolution *SE;
6400 TargetTransformInfo *TTI;
6401 TargetLibraryInfo *TLI;
6404 AssumptionCache *AC;
6406 const DataLayout *DL;
6407 OptimizationRemarkEmitter *ORE;
// NOTE(review): presumably vector register size bounds in bits, matching the
// command-line option descriptions at the top of the file -- confirm.
6409 unsigned MaxVecRegSize;
6410 unsigned MinVecRegSize;
// IR builder (with TargetFolder) used to emit instructions.
6413 IRBuilder<TargetFolder> Builder;
// Per-tree-entry pair of (uint64_t, bool).
// NOTE(review): presumably minimum bit width and a signedness flag -- confirm.
6420 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6425 unsigned ReductionBitWidth = 0;
6428 unsigned BaseGraphSize = 1;
6432 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6436 DenseSet<unsigned> ExtraBitWidthNodes;
6444 SecondInfo::getEmptyKey());
6449 SecondInfo::getTombstoneKey());
6454 SecondInfo::getHashValue(Val.
EdgeIdx));
6475 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6486 return R.VectorizableTree[0].get();
6490 return {&
N->UserTreeIndex,
N->Container};
6494 return {&
N->UserTreeIndex + 1,
N->Container};
6521 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
6533 OS << Entry->Idx <<
".\n";
6536 for (
auto *V : Entry->Scalars) {
6538 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
6539 return EU.Scalar == V;
6549 if (Entry->isGather())
6551 if (Entry->State == TreeEntry::ScatterVectorize ||
6552 Entry->State == TreeEntry::StridedVectorize ||
6553 Entry->State == TreeEntry::CompressVectorize)
6554 return "color=blue";
// Teardown of the instructions collected in DeletedInstructions (the
// enclosing function's signature line is not part of this excerpt).
// First pass: instructions that currently have no parent are re-inserted
// into the function's entry block, operands are inspected, and all operand
// references are dropped.
6561 for (
auto *
I : DeletedInstructions) {
6562 if (!
I->getParent()) {
// NOTE(review): the condition choosing between the two insertion points
// below is not visible in this excerpt.
6567 I->insertBefore(F->getEntryBlock(),
6568 F->getEntryBlock().getFirstNonPHIIt());
6570 I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
6573 for (
Use &U :
I->operands()) {
// Operand that is not itself scheduled for deletion and has a single user.
6575 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
6579 I->dropAllReferences();
// Second pass: erase the instructions now that no references remain.
6581 for (
auto *
I : DeletedInstructions) {
6583 "trying to erase instruction with users.");
6584 I->eraseFromParent();
6590#ifdef EXPENSIVE_CHECKS
6601 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
6602 "Expected non-empty mask.");
6605 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
6607 Reuses[Mask[
I]] = Prev[
I];
6615 bool BottomOrder =
false) {
6616 assert(!Mask.empty() &&
"Expected non-empty mask.");
6617 unsigned Sz = Mask.size();
6620 if (Order.
empty()) {
6622 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
6624 PrevOrder.
swap(Order);
6627 for (
unsigned I = 0;
I < Sz; ++
I)
6629 Order[
I] = PrevOrder[Mask[
I]];
6631 return Data.value() == Sz ||
Data.index() ==
Data.value();
6640 if (Order.
empty()) {
6642 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
6652 for (
unsigned I = 0;
I < Sz; ++
I)
6654 Order[MaskOrder[
I]] =
I;
// Tries to derive an element order for the gather node TE from shuffles of
// extractelements (ExtractShuffles / ExtractMask) and of existing tree
// entries (GatherShuffles / Mask / Entries). Returns std::nullopt when no
// usable order is found, an identity order in one special case, or the
// computed CurrentOrder otherwise.
// NOTE(review): the line carrying the function name and first parameter, and
// many interior lines (lambda headers, loop headers, branch conditions) are
// not part of this excerpt; comments describe visible statements only.
6658std::optional<BoUpSLP::OrdersType>
6660 bool TopToBottom,
bool IgnoreReorder) {
6661 assert(TE.isGather() &&
"Expected gather node only.");
6665 Type *ScalarTy = GatheredScalars.
front()->getType();
6666 size_t NumScalars = GatheredScalars.
size();
6668 return std::nullopt;
6675 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
6677 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
// Nothing to reuse: neither kind of shuffle was found.
6680 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
6681 return std::nullopt;
// Every slot starts at NumScalars, which serves as the "unset" marker (see
// the count of NumScalars entries near the end).
6682 OrdersType CurrentOrder(NumScalars, NumScalars);
// Special case: a single gather shuffle whose first entry has exactly the
// same scalars as TE.
6683 if (GatherShuffles.
size() == 1 &&
6685 Entries.
front().front()->isSame(TE.Scalars)) {
6689 return std::nullopt;
// Both nodes hang off the same user tree entry.
6691 if (Entries.
front().front()->UserTreeIndex.UserTE ==
6692 TE.UserTreeIndex.UserTE)
6693 return std::nullopt;
// The matching entry is the root (Idx == 0) and reordering is not ignored.
6696 if (!IgnoreReorder && Entries.
front().front()->Idx == 0)
6697 return std::nullopt;
6700 if (!Entries.
front().front()->ReuseShuffleIndices.empty() &&
6701 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6704 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6706 return std::nullopt;
// Otherwise report the identity order.
6710 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
6711 return CurrentOrder;
6715 return all_of(Mask, [&](
int I) {
// Splat masks give no ordering information.
6722 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
6723 (Entries.
size() != 1 ||
6724 Entries.
front().front()->ReorderIndices.empty())) ||
6725 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
6726 return std::nullopt;
// Per-part processing: parts already flagged in ShuffledSubMasks are tested
// up front; parts that reference a second vector get flagged.
6732 if (ShuffledSubMasks.
test(
I))
6734 const int VF = GetVF(
I);
6742 ShuffledSubMasks.
set(
I);
6746 int FirstMin = INT_MAX;
// NOTE(review): declared as int but only ever assigned true/false and used
// as a condition; a bool would express the intent more directly.
6747 int SecondVecFound =
false;
6749 int Idx = Mask[
I * PartSz + K];
6751 Value *V = GatheredScalars[
I * PartSz + K];
6753 SecondVecFound =
true;
6762 SecondVecFound =
true;
// Round FirstMin down to a multiple of the part size.
6766 FirstMin = (FirstMin / PartSz) * PartSz;
6768 if (SecondVecFound) {
6770 ShuffledSubMasks.
set(
I);
6774 int Idx = Mask[
I * PartSz + K];
6778 if (Idx >= PartSz) {
6781 SecondVecFound =
true;
// Bounds check before indexing CurrentOrder.
6787 if (
static_cast<unsigned>(
I * PartSz + Idx) >= CurrentOrder.
size())
// Overwrite the slot with the smaller position unless it already maps to
// itself.
6789 if (CurrentOrder[
I * PartSz + Idx] >
6790 static_cast<unsigned>(
I * PartSz + K) &&
6791 CurrentOrder[
I * PartSz + Idx] !=
6792 static_cast<unsigned>(
I * PartSz + Idx))
6793 CurrentOrder[
I * PartSz + Idx] =
I * PartSz + K;
6796 if (SecondVecFound) {
6798 ShuffledSubMasks.
set(
I);
// Apply the extract-shuffle mask; the callback computes the vector factor
// for part I.
6804 if (!ExtractShuffles.
empty())
6805 TransformMaskToOrder(
6806 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
6807 if (
I >= ExtractShuffles.
size() || !ExtractShuffles[
I])
6810 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
6812 int K =
I * PartSz + Idx;
6813 if (
static_cast<unsigned>(K) >= ExtractMask.
size())
// Map K back through the node's reuse and reorder indices.
6817 if (!TE.ReuseShuffleIndices.empty())
6818 K = TE.ReuseShuffleIndices[K];
6821 if (!TE.ReorderIndices.empty())
6822 K = std::distance(TE.ReorderIndices.begin(),
6823 find(TE.ReorderIndices, K));
6827 VF = std::max(VF, EI->getVectorOperandType()
6829 .getKnownMinValue());
// A single gather shuffle spanning several parts is treated as one part,
// unless some sub-mask was already flagged.
6834 if (GatherShuffles.
size() == 1 && NumParts != 1) {
6835 if (ShuffledSubMasks.
any())
6836 return std::nullopt;
6837 PartSz = NumScalars;
// Apply the gather-shuffle mask; the callback yields the larger vector
// factor of the first and last entry of part I.
6840 if (!Entries.
empty())
6841 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
6842 if (
I >= GatherShuffles.
size() || !GatherShuffles[
I])
6844 return std::max(Entries[
I].front()->getVectorFactor(),
6845 Entries[
I].back()->getVectorFactor());
// Slots still equal to NumScalars were never assigned.
6847 unsigned NumUndefs =
count(CurrentOrder, NumScalars);
// Give up when every part was flagged, or when (for more than two scalars)
// at least half of the slots are unassigned.
6848 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6849 return std::nullopt;
6850 return std::move(CurrentOrder);
6855 bool CompareOpcodes =
true) {
6861 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6862 (!GEP2 || GEP2->getNumOperands() == 2) &&
6863 (((!GEP1 ||
isConstant(GEP1->getOperand(1))) &&
6864 (!GEP2 ||
isConstant(GEP2->getOperand(1)))) ||
6867 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
6871template <
typename T>
6876 return CommonAlignment;
6882 "Order is empty. Please check it before using isReverseOrder.");
6883 unsigned Sz = Order.
size();
6885 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
// Stride analysis over the SCEVs of PointerOps (the function's signature
// line is not part of this excerpt). The visible statements track the lowest
// and highest pointer SCEVs, try to factor a stride out of a distance
// expression, and collect per-pointer (distance, index) pairs to detect
// whether the pointers are consecutive and to produce SortedIndices.
// NOTE(review): many interior lines are missing; confirm details against the
// full source.
6906 "Coeffs vector needs to be of correct size");
6908 const SCEV *PtrSCEVLowest =
nullptr;
6909 const SCEV *PtrSCEVHighest =
nullptr;
6912 for (
Value *Ptr : PointerOps) {
// The first pointer seen initialises both extremes.
6917 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6918 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6925 PtrSCEVLowest = PtrSCEV;
6932 PtrSCEVHighest = PtrSCEV;
// Store size of the element type.
6940 int Size =
DL.getTypeStoreSize(ElemTy);
// Given Dist and Multiplier, returns the other factor when Dist is a product
// involving Multiplier (M is that product; its definition is not visible).
6941 auto TryGetStride = [&](
const SCEV *Dist,
6942 const SCEV *Multiplier) ->
const SCEV * {
6944 if (M->getOperand(0) == Multiplier)
6945 return M->getOperand(1);
6946 if (M->getOperand(1) == Multiplier)
6947 return M->getOperand(0);
6950 if (Multiplier == Dist)
6955 const SCEV *Stride =
nullptr;
6956 if (
Size != 1 || SCEVs.
size() > 1) {
6958 Stride = TryGetStride(Dist, Sz);
// (distance, original index) pairs kept in a set ordered by Compare.
6966 using DistOrdPair = std::pair<int64_t, int>;
6968 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
6970 bool IsConsecutive =
true;
6971 for (
const auto [Idx, PtrSCEV] :
enumerate(SCEVs)) {
6973 if (PtrSCEV != PtrSCEVLowest) {
6975 const SCEV *Coeff = TryGetStride(Diff, Stride);
6981 Coeffs[Idx] = (int64_t)SC->getAPInt().getLimitedValue();
6986 Dist = SC->getAPInt().getZExtValue();
6993 auto Res = Offsets.emplace(Dist, Cnt);
// Still consecutive only if the new element landed at the end of the set.
6997 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
// A size mismatch means at least one (distance, index) pair was rejected as
// a duplicate by the set.
7000 if (Offsets.size() != SCEVs.
size())
7002 SortedIndices.
clear();
// For non-consecutive pointers, emit the indices in sorted-offset order.
7003 if (!IsConsecutive) {
7007 for (
const std::pair<int64_t, int> &Pair : Offsets) {
7008 SortedIndices[Cnt] = Pair.second;
7015static std::pair<InstructionCost, InstructionCost>
7034 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask,
CostKind, Index, SubTp,
7036 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7039 Mask, NumSrcElts, NumSubElts, Index)) {
7040 if (Index + NumSubElts > NumSrcElts &&
7041 Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
7045 return TTI.getShuffleCost(Kind, DstTy, Tp, Mask,
CostKind, Index, SubTp,
7054 const APInt &DemandedElts,
bool Insert,
bool Extract,
7059 "ScalableVectorType is not supported.");
7062 "Incorrect usage.");
7067 unsigned ScalarTyNumElements = VecTy->getNumElements();
7070 if (!DemandedElts[
I])
7074 I * ScalarTyNumElements, VecTy);
7077 I * ScalarTyNumElements, VecTy);
7081 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
7090 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
7091 if (Opcode == Instruction::ExtractElement) {
7097 Index * VecTy->getNumElements(), VecTy);
7100 return TTI.getVectorInstrCost(Opcode, Val,
CostKind, Index, Scalar,
7113 getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
7115 Index * ScalarTy->getNumElements(), SubTp) +
7119 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index,
CostKind);
7135 auto *Begin = std::next(
Mask.begin(), Index);
7136 std::iota(Begin, std::next(Begin, SubVecVF), 0);
7137 Vec = Builder.CreateShuffleVector(V, Mask);
7140 std::iota(
Mask.begin(),
Mask.end(), 0);
7141 std::iota(std::next(
Mask.begin(), Index),
7142 std::next(
Mask.begin(), Index + SubVecVF), VecVF);
7144 return Generator(Vec, V, Mask);
7147 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
7148 V = Builder.CreateShuffleVector(V, ResizeMask);
7150 return Builder.CreateShuffleVector(Vec, V, Mask);
7155 unsigned SubVecVF,
unsigned Index) {
7157 std::iota(Mask.begin(), Mask.end(), Index);
7158 return Builder.CreateShuffleVector(Vec, Mask);
7168 const unsigned Sz = PointerOps.
size();
7171 CompressMask[0] = 0;
7173 std::optional<unsigned> Stride = 0;
7176 Value *Ptr = Order.
empty() ? PointerOps[
I] : PointerOps[Order[
I]];
7177 std::optional<int64_t> OptPos =
7179 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
7181 unsigned Pos =
static_cast<unsigned>(*OptPos);
7182 CompressMask[
I] = Pos;
7189 if (Pos != *Stride *
I)
7192 return Stride.has_value();
7205 InterleaveFactor = 0;
7207 const size_t Sz = VL.
size();
7215 if (AreAllUsersVectorized(V))
7218 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind,
7219 Mask.empty() ?
I : Mask[
I]);
7222 if (ExtractCost <= ScalarCost)
7227 if (Order.
empty()) {
7228 Ptr0 = PointerOps.
front();
7229 PtrN = PointerOps.
back();
7231 Ptr0 = PointerOps[Order.
front()];
7232 PtrN = PointerOps[Order.
back()];
7234 std::optional<int64_t> Diff =
7238 const size_t MaxRegSize =
7242 if (*Diff / Sz >= MaxRegSize / 8)
7246 Align CommonAlignment = LI->getAlign();
7248 Ptr0, LoadVecTy, CommonAlignment,
DL,
7251 if (IsMasked && !
TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
7252 LI->getPointerAddressSpace()))
7258 assert(CompressMask.
size() >= 2 &&
"At least two elements are required");
7262 auto [ScalarGEPCost, VectorGEPCost] =
7264 Instruction::Load,
CostKind, ScalarTy, LoadVecTy);
7281 LoadCost =
TTI.getMemIntrinsicInstrCost(
7284 LI->getPointerAddressSpace()),
7288 TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
7289 LI->getPointerAddressSpace(),
CostKind);
7291 if (IsStrided && !IsMasked && Order.
empty()) {
7298 AlignedLoadVecTy = LoadVecTy;
7299 if (
TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
7301 LI->getPointerAddressSpace())) {
7303 VectorGEPCost +
TTI.getInterleavedMemoryOpCost(
7304 Instruction::Load, AlignedLoadVecTy,
7305 CompressMask[1], {}, CommonAlignment,
7306 LI->getPointerAddressSpace(),
CostKind, IsMasked);
7307 if (InterleavedCost < GatherCost) {
7308 InterleaveFactor = CompressMask[1];
7309 LoadVecTy = AlignedLoadVecTy;
7316 if (!Order.
empty()) {
7319 NewMask[
I] = CompressMask[Mask[
I]];
7321 CompressMask.
swap(NewMask);
7323 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
7324 return TotalVecCost < GatherCost;
7337 unsigned InterleaveFactor;
7341 AreAllUsersVectorized, IsMasked, InterleaveFactor,
7342 CompressMask, LoadVecTy);
7359 Align Alignment,
const int64_t Diff,
7360 const size_t Sz)
const {
7361 if (Diff % (Sz - 1) != 0)
7365 auto IsAnyPointerUsedOutGraph =
any_of(PointerOps, [&](
Value *V) {
7367 return !isVectorized(U) && !MustGather.contains(U);
7371 const uint64_t AbsoluteDiff = std::abs(Diff);
7373 if (IsAnyPointerUsedOutGraph ||
7374 (AbsoluteDiff > Sz &&
7378 Diff == -(
static_cast<int64_t
>(Sz) - 1)) {
7379 int64_t Stride = Diff /
static_cast<int64_t
>(Sz - 1);
7380 if (Diff != Stride *
static_cast<int64_t
>(Sz - 1))
7382 if (!TTI->isLegalStridedLoadStore(VecTy, Alignment))
7393 const size_t Sz = PointerOps.
size();
7401 SortedIndices.
empty() ? PointerOps[
I] : PointerOps[SortedIndices[
I]];
7402 std::optional<int64_t>
Offset =
7404 assert(
Offset &&
"sortPtrAccesses should have validated this pointer");
7405 SortedOffsetsFromBase[
I] = *
Offset;
7422 int64_t StrideWithinGroup =
7423 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7426 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](
unsigned Idx) {
7427 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7432 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7434 unsigned VecSz = Sz;
7435 Type *NewScalarTy = ScalarTy;
7439 bool NeedsWidening = Sz != GroupSize;
7440 const uint64_t UnitBitWidth = DL->getTypeSizeInBits(ScalarTy).getFixedValue();
7441 if (NeedsWidening) {
7442 if (Sz % GroupSize != 0)
7445 if (StrideWithinGroup != 1)
7447 VecSz = Sz / GroupSize;
7448 NewScalarTy =
Type::getIntNTy(SE->getContext(), UnitBitWidth * GroupSize);
7453 if (!
isStridedLoad(PointerOps, NewScalarTy, Alignment, Diff, VecSz))
7456 int64_t StrideIntVal = StrideWithinGroup;
7457 if (NeedsWidening) {
7460 unsigned CurrentGroupStartIdx = GroupSize;
7461 int64_t StrideBetweenGroups =
7462 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7463 StrideIntVal = StrideBetweenGroups;
7464 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7465 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7466 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7467 StrideBetweenGroups)
7471 auto CheckGroup = [=](
const unsigned StartIdx) ->
bool {
7474 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7475 return GroupEndIdx - StartIdx == GroupSize;
7477 for (
unsigned I = 0;
I < Sz;
I += GroupSize) {
7483 Type *StrideTy = DL->getIndexType(Ptr0->
getType());
7493 bool IsLoad)
const {
7499 OffsetToPointerOpIdxMap;
7500 for (
auto [Idx, Ptr] :
enumerate(PointerOps)) {
7501 const SCEV *PtrSCEV = SE->getSCEV(Ptr);
7513 Offset = SC->getAPInt().getSExtValue();
7514 if (
Offset >= std::numeric_limits<int64_t>::max() - 1) {
7521 OffsetToPointerOpIdxMap[
Offset].first.push_back(Ptr);
7522 OffsetToPointerOpIdxMap[
Offset].second.push_back(Idx);
7524 unsigned NumOffsets = OffsetToPointerOpIdxMap.
size();
7528 const unsigned Sz = PointerOps.
size();
7529 unsigned VecSz = Sz;
7530 Type *NewScalarTy = BaseTy;
7531 if (NumOffsets > 1) {
7532 if (Sz % NumOffsets != 0)
7534 VecSz = Sz / NumOffsets;
7539 DL->getTypeSizeInBits(BaseTy).getFixedValue() * NumOffsets);
7541 unsigned MinProfitableStridedOps =
7544 if (Sz * BaseTyNumElts < MinProfitableStridedOps ||
7545 !TTI->isTypeLegal(StridedLoadTy) ||
7546 !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment))
7552 for (
auto [Idx, MapPair] :
enumerate(OffsetToPointerOpIdxMap)) {
7553 if (MapPair.second.first.size() != VecSz)
7555 SortedOffsetsV[Idx] = MapPair.first;
7557 sort(SortedOffsetsV);
7559 if (NumOffsets > 1) {
7560 int64_t BaseBytes = DL->getTypeStoreSize(BaseTy);
7562 if (SortedOffsetsV[
I] - SortedOffsetsV[
I - 1] != BaseBytes)
7635 auto UpdateSortedIndices =
7638 if (SortedIndicesForOffset.
empty()) {
7639 SortedIndicesForOffset.
resize(IndicesInAllPointerOps.
size());
7640 std::iota(SortedIndicesForOffset.
begin(),
7641 SortedIndicesForOffset.
end(), 0);
7643 for (
const auto [Num, Idx] :
enumerate(SortedIndicesForOffset)) {
7644 SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
7645 IndicesInAllPointerOps[Idx];
7649 int64_t LowestOffset = SortedOffsetsV[0];
7655 SortedIndicesForOffset0, Coeffs0);
7658 unsigned NumCoeffs0 = Coeffs0.
size();
7659 if (NumCoeffs0 * NumOffsets != Sz)
7664 OffsetToPointerOpIdxMap[LowestOffset].second;
7665 UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
7671 for (
int J :
seq<int>(1, NumOffsets)) {
7674 SortedIndicesForOffset.
clear();
7676 int64_t
Offset = SortedOffsetsV[J];
7678 OffsetToPointerOpIdxMap[
Offset].first;
7680 OffsetToPointerOpIdxMap[
Offset].second;
7682 PointerOpsForOffset, BaseTy, *DL, *SE, SortedIndicesForOffset, Coeffs);
7684 if (!StrideWithinGroup || StrideWithinGroup != Stride0)
7686 if (Coeffs.
size() != NumCoeffs0)
7689 if (Coeffs != Coeffs0)
7692 UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, J);
7695 SortedIndices.
clear();
7696 SortedIndices = std::move(SortedIndicesDraft);
7698 SPtrInfo.
Ty = StridedLoadTy;
7705 unsigned *BestVF,
bool TryRecursiveCheck)
const {
7718 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
7724 const size_t Sz = VL.
size();
7726 auto *POIter = PointerOps.
begin();
7727 for (
Value *V : VL) {
7729 if (!L || !L->isSimple())
7731 *POIter = L->getPointerOperand();
7737 bool IsSorted =
sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
7744 std::optional<bool> MaskedGatherLegal;
7745 auto IsMaskedGatherLegal = [&] {
7746 if (!MaskedGatherLegal)
7748 TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
7749 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment);
7750 return *MaskedGatherLegal;
7757 if (!IsMaskedGatherLegal())
7768 if (Order.
empty()) {
7769 Ptr0 = PointerOps.
front();
7770 PtrN = PointerOps.
back();
7772 Ptr0 = PointerOps[Order.
front()];
7773 PtrN = PointerOps[Order.
back()];
7778 std::optional<int64_t> Diff0 =
7780 std::optional<int64_t> DiffN =
7783 "sortPtrAccesses should have validated these pointers");
7784 int64_t Diff = *DiffN - *Diff0;
7786 if (
static_cast<uint64_t>(Diff) == Sz - 1)
7789 *TLI, [&](
Value *V) {
7790 return areAllUsersVectorized(
7798 Diff, Ptr0, SPtrInfo))
7801 if (!IsMaskedGatherLegal())
7806 auto CheckForShuffledLoads = [&, &TTI = *TTI](
Align CommonAlignment,
7808 bool ProfitableGatherPointers) {
7813 auto [ScalarGEPCost, VectorGEPCost] =
7819 Type *PtrScalarTy = PointerOps.
front()->getType()->getScalarType();
7825 if (
static_cast<unsigned>(
count_if(
7848 TTI.getMemIntrinsicInstrCost(
7851 false, CommonAlignment),
7853 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7861 constexpr unsigned ListLimit = 4;
7862 if (!TryRecursiveCheck || VL.
size() < ListLimit)
7865 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
7875 for (
unsigned Cnt = 0, End = VL.
size(); Cnt < End; Cnt += VF) {
7876 const unsigned SliceVF = std::min(VF, End - Cnt);
7881 PointerOps, SPtrInfo, BestVF,
7889 DemandedElts.
setBits(Cnt, Cnt + SliceVF);
7905 if (!DemandedElts.
isZero()) {
7911 if (DemandedElts[Idx])
7915 for (
const auto &[SliceStart, LS] : States) {
7916 const unsigned SliceVF = std::min<unsigned>(VF, VL.
size() - SliceStart);
7923 ArrayRef(PointerOps).slice(SliceStart, SliceVF),
7924 LI0->getPointerOperand(), Instruction::Load,
7928 if (
static_cast<unsigned>(
7930 PointerOps.
size() - 1 ||
7948 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
7949 LI0->getPointerAddressSpace(),
CostKind,
7954 VecLdCost += TTI.getMemIntrinsicInstrCost(
7956 Intrinsic::experimental_vp_strided_load,
7957 SubVecTy, LI0->getPointerOperand(),
7958 false, CommonAlignment),
7963 VecLdCost += TTI.getMemIntrinsicInstrCost(
7965 Intrinsic::masked_load, SubVecTy,
7966 CommonAlignment, LI0->getPointerAddressSpace()),
7972 VecLdCost += TTI.getMemIntrinsicInstrCost(
7974 Intrinsic::masked_gather, SubVecTy,
7975 LI0->getPointerOperand(),
7976 false, CommonAlignment),
7984 const unsigned SliceIdx = SliceStart / VF;
7986 ShuffleMask[Idx] = Idx / VF == SliceIdx ? VL.
size() + Idx % VF : Idx;
7995 if (MaskedGatherCost >= VecLdCost &&
8008 bool ProfitableGatherPointers =
8009 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
8010 return L->isLoopInvariant(V);
8012 if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
8015 (
GEP &&
GEP->getNumOperands() == 2 &&
8023 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
8024 ProfitableGatherPointers))
8036 all_of(VL, [](
const Value *V) {
return V->getType()->isPointerTy(); }) &&
8037 "Expected list of pointer operands.");
8042 std::pair<BasicBlock *, Value *>,
8046 .try_emplace(std::make_pair(
8050 SortedIndices.
clear();
8052 auto Key = std::make_pair(BBs[Cnt + 1],
8054 bool Found =
any_of(Bases.try_emplace(
Key).first->second,
8055 [&, &Cnt = Cnt, &Ptr = Ptr](
auto &
Base) {
8056 std::optional<int64_t> Diff =
8057 getPointersDiff(ElemTy, std::get<0>(Base.front()),
8058 ElemTy, Ptr, DL, SE,
8063 Base.emplace_back(Ptr, *Diff, Cnt + 1);
8069 if (Bases.size() > VL.
size() / 2 - 1)
8073 Bases.find(
Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
8077 if (Bases.size() == VL.
size())
8080 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
8081 Bases.front().second.size() == VL.
size()))
8086 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
8095 FirstPointers.
insert(P1);
8096 SecondPointers.
insert(P2);
8102 "Unable to find matching root.");
8105 for (
auto &
Base : Bases) {
8106 for (
auto &Vec :
Base.second) {
8107 if (Vec.size() > 1) {
8109 int64_t InitialOffset = std::get<1>(Vec[0]);
8110 bool AnyConsecutive =
8112 return std::get<1>(
P.value()) ==
8113 int64_t(
P.index()) + InitialOffset;
8117 if (!AnyConsecutive)
8122 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
8126 for (
auto &
T : Bases)
8127 for (
const auto &Vec :
T.second)
8128 for (
const auto &
P : Vec)
8132 "Expected SortedIndices to be the size of VL");
8136std::optional<BoUpSLP::OrdersType>
8138 assert(TE.isGather() &&
"Expected gather node only.");
8139 Type *ScalarTy = TE.Scalars[0]->getType();
8142 Ptrs.
reserve(TE.Scalars.size());
8144 BBs.
reserve(TE.Scalars.size());
8145 for (
Value *V : TE.Scalars) {
8147 if (!L || !L->isSimple())
8148 return std::nullopt;
8154 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
8156 return std::move(Order);
8157 return std::nullopt;
8168 if (VU->
getType() != V->getType())
8171 if (!VU->
hasOneUse() && !V->hasOneUse())
8177 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
8183 bool IsReusedIdx =
false;
8185 if (IE2 == VU && !IE1)
8187 if (IE1 == V && !IE2)
8188 return V->hasOneUse();
8189 if (IE1 && IE1 != V) {
8191 IsReusedIdx |= ReusedIdx.
test(Idx1);
8192 ReusedIdx.
set(Idx1);
8193 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
8198 if (IE2 && IE2 != VU) {
8200 IsReusedIdx |= ReusedIdx.
test(Idx2);
8201 ReusedIdx.
set(Idx2);
8202 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
8207 }
while (!IsReusedIdx && (IE1 || IE2));
8217std::optional<BoUpSLP::OrdersType>
8219 bool IgnoreReorder) {
8222 if (!TE.ReuseShuffleIndices.empty()) {
8224 return std::nullopt;
8232 unsigned Sz = TE.Scalars.size();
8233 if (TE.isGather()) {
8234 if (std::optional<OrdersType> CurrentOrder =
8239 ::addMask(Mask, TE.ReuseShuffleIndices);
8240 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
8241 unsigned Sz = TE.Scalars.size();
8242 for (
int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
8245 Res[Idx + K * Sz] =
I + K * Sz;
8247 return std::move(Res);
8250 if (Sz == 2 && TE.getVectorFactor() == 4 &&
8253 2 * TE.getVectorFactor()),
8255 return std::nullopt;
8256 if (TE.ReuseShuffleIndices.size() % Sz != 0)
8257 return std::nullopt;
8261 if (TE.ReorderIndices.empty())
8262 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
8265 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
8266 unsigned VF = ReorderMask.
size();
8270 for (
unsigned I = 0;
I < VF;
I += Sz) {
8272 unsigned UndefCnt = 0;
8273 unsigned Limit = std::min(Sz, VF -
I);
8283 UsedVals.
test(Val) || UndefCnt > Sz / 2)
8284 return std::nullopt;
8286 for (
unsigned K = 0; K < NumParts; ++K) {
8287 unsigned Idx = Val + Sz * K;
8288 if (Idx < VF &&
I + K < VF)
8289 ResOrder[Idx] =
I + K;
8292 return std::move(ResOrder);
8294 unsigned VF = TE.getVectorFactor();
8297 TE.ReuseShuffleIndices.end());
8298 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
8300 if (isa<PoisonValue>(V))
8302 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
8303 return Idx && *Idx < Sz;
8305 assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported "
8306 "by BinaryOperator and CastInst.");
8308 if (TE.ReorderIndices.empty())
8309 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
8312 for (
unsigned I = 0;
I < VF; ++
I) {
8313 int &Idx = ReusedMask[
I];
8316 Value *V = TE.Scalars[ReorderMask[Idx]];
8318 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
8324 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
8325 auto *It = ResOrder.
begin();
8326 for (
unsigned K = 0; K < VF; K += Sz) {
8330 std::iota(SubMask.
begin(), SubMask.
end(), 0);
8332 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
8333 std::advance(It, Sz);
8336 return Data.index() ==
Data.value();
8338 return std::nullopt;
8339 return std::move(ResOrder);
8341 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
8342 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
8344 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
8345 return std::nullopt;
8346 if (TE.State == TreeEntry::SplitVectorize ||
8347 ((TE.State == TreeEntry::Vectorize ||
8348 TE.State == TreeEntry::StridedVectorize ||
8349 TE.State == TreeEntry::CompressVectorize) &&
8352 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
8353 "Alternate instructions are only supported by "
8354 "BinaryOperator and CastInst.");
8355 return TE.ReorderIndices;
8357 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
8358 TE.isAltShuffle()) {
8359 assert(TE.ReuseShuffleIndices.empty() &&
8360 "ReuseShuffleIndices should be "
8361 "empty for alternate instructions.");
8363 TE.buildAltOpShuffleMask(
8365 assert(TE.getMatchingMainOpOrAltOp(
I) &&
8366 "Unexpected main/alternate opcode");
8370 const int VF = TE.getVectorFactor();
8375 ResOrder[Mask[
I] % VF] =
I;
8377 return std::move(ResOrder);
8379 if (!TE.ReorderIndices.empty())
8380 return TE.ReorderIndices;
8381 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
8382 if (!TE.ReorderIndices.empty())
8383 return TE.ReorderIndices;
8386 for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
8394 while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
8402 assert(BB1 != BB2 &&
"Expected different basic blocks.");
8403 if (!DT->isReachableFromEntry(BB1))
8405 if (!DT->isReachableFromEntry(BB2))
8407 auto *NodeA = DT->getNode(BB1);
8408 auto *NodeB = DT->getNode(BB2);
8409 assert(NodeA &&
"Should only process reachable instructions");
8410 assert(NodeB &&
"Should only process reachable instructions");
8411 assert((NodeA == NodeB) ==
8412 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
8413 "Different nodes should have different DFS numbers");
8414 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
8416 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
8417 Value *V1 = TE.Scalars[I1];
8418 Value *V2 = TE.Scalars[I2];
8431 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
8432 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
8433 FirstUserOfPhi2->getParent());
8443 if (UserBVHead[I1] && !UserBVHead[I2])
8445 if (!UserBVHead[I1])
8447 if (UserBVHead[I1] == UserBVHead[I2])
8450 return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
8452 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
8465 if (EE1->getOperand(0) == EE2->getOperand(0))
8467 if (!Inst1 && Inst2)
8469 if (Inst1 && Inst2) {
8477 "Expected either instructions or arguments vector operands.");
8478 return P1->getArgNo() < P2->getArgNo();
8483 std::iota(Phis.
begin(), Phis.
end(), 0);
8486 return std::nullopt;
8487 return std::move(Phis);
8489 if (TE.isGather() &&
8490 (!TE.hasState() || !TE.isAltShuffle() ||
8491 ScalarsInSplitNodes.contains(TE.getMainOp())) &&
8495 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
8499 auto *EE = dyn_cast<ExtractElementInst>(V);
8500 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
8506 canReuseExtract(TE.Scalars, CurrentOrder,
true);
8507 if (Reuse || !CurrentOrder.
empty())
8508 return std::move(CurrentOrder);
8516 int Sz = TE.Scalars.size();
8520 if (It == TE.Scalars.begin())
8523 if (It != TE.Scalars.end()) {
8525 unsigned Idx = std::distance(TE.Scalars.begin(), It);
8540 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
8543 return std::move(Order);
8548 return std::nullopt;
8549 if (TE.Scalars.size() >= 3)
8554 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
8559 CurrentOrder, PointerOps, SPtrInfo);
8562 return std::move(CurrentOrder);
8564 if (std::optional<OrdersType> CurrentOrder =
8566 return CurrentOrder;
8568 return std::nullopt;
8578 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
8580 if (Cluster != FirstCluster)
8586void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const {
8589 const unsigned Sz =
TE.Scalars.size();
8591 if (!
TE.isGather() ||
8598 addMask(NewMask,
TE.ReuseShuffleIndices);
8600 TE.ReorderIndices.clear();
8607 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
8608 *End =
TE.ReuseShuffleIndices.end();
8609 It != End; std::advance(It, Sz))
8610 std::iota(It, std::next(It, Sz), 0);
8616 "Expected same size of orders");
8617 size_t Sz = Order.
size();
8620 if (Order[Idx] != Sz)
8621 UsedIndices.
set(Order[Idx]);
8623 if (SecondaryOrder.
empty()) {
8625 if (Order[Idx] == Sz && !UsedIndices.
test(Idx))
8629 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8630 !UsedIndices.
test(SecondaryOrder[Idx]))
8631 Order[Idx] = SecondaryOrder[Idx];
8639 constexpr unsigned TinyVF = 2;
8640 constexpr unsigned TinyTree = 10;
8641 constexpr unsigned PhiOpsLimit = 12;
8642 constexpr unsigned GatherLoadsLimit = 2;
8643 if (VectorizableTree.size() <= TinyTree)
8645 if (VectorizableTree.front()->hasState() &&
8646 !VectorizableTree.front()->isGather() &&
8647 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
8648 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
8649 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
8650 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
8651 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
8652 VectorizableTree.front()->ReorderIndices.empty()) {
8656 if (VectorizableTree.front()->hasState() &&
8657 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
8658 VectorizableTree.front()->Scalars.size() == TinyVF &&
8659 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
8662 if (VectorizableTree.front()->hasState() &&
8663 VectorizableTree.front()->getOpcode() == Instruction::Store &&
8664 VectorizableTree.front()->ReorderIndices.empty()) {
8665 const unsigned ReorderedSplitsCnt =
8666 count_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
8667 return TE->State == TreeEntry::SplitVectorize &&
8668 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
8669 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8672 if (ReorderedSplitsCnt <= 1 &&
8674 VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
8675 return ((!TE->isGather() &&
8676 (TE->ReorderIndices.empty() ||
8677 (TE->UserTreeIndex.UserTE &&
8678 TE->UserTreeIndex.UserTE->State ==
8679 TreeEntry::Vectorize &&
8680 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
8682 (TE->isGather() && TE->ReorderIndices.empty() &&
8683 (!TE->hasState() || TE->isAltShuffle() ||
8684 TE->getOpcode() == Instruction::Load ||
8685 TE->getOpcode() == Instruction::ZExt ||
8686 TE->getOpcode() == Instruction::SExt))) &&
8687 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
8688 !TE->isGather() ||
none_of(TE->Scalars, [&](
Value *V) {
8689 return !isConstant(V) && isVectorized(V);
8691 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
8694 bool HasPhis =
false;
8695 bool HasLoad =
true;
8696 unsigned GatherLoads = 0;
8697 for (
const std::unique_ptr<TreeEntry> &TE :
8698 ArrayRef(VectorizableTree).drop_front()) {
8699 if (TE->State == TreeEntry::SplitVectorize)
8701 if (!TE->hasState()) {
8705 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8710 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8711 if (!TE->isGather()) {
8718 if (GatherLoads >= GatherLoadsLimit)
8721 if (TE->getOpcode() == Instruction::GetElementPtr ||
8724 if (TE->getOpcode() != Instruction::PHI &&
8725 (!TE->hasCopyableElements() ||
8727 TE->Scalars.size() / 2))
8729 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8730 TE->getNumOperands() > PhiOpsLimit)
8739void BoUpSLP::TreeEntry::reorderSplitNode(
unsigned Idx,
ArrayRef<int> Mask,
8741 assert(State == TreeEntry::SplitVectorize &&
"Expected split user node.");
8744 std::iota(NewMask.
begin(), NewMask.
end(), 0);
8745 std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
8748 copy(MaskOrder, NewMaskOrder.begin());
8750 assert(Idx == 1 &&
"Expected either 0 or 1 index.");
8751 unsigned Offset = CombinedEntriesWithIndices.
back().second;
8760 ReorderIndices.clear();
8779 ExternalUserReorderMap;
8782 const bool IgnoreReorder =
8783 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8784 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
8785 VectorizableTree.front()->getOpcode() == Instruction::Store);
8789 for_each(VectorizableTree, [&, &TTIRef = *TTI](
8790 const std::unique_ptr<TreeEntry> &TE) {
8793 findExternalStoreUsersReorderIndices(TE.get());
8794 if (!ExternalUserReorderIndices.
empty()) {
8795 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8797 std::move(ExternalUserReorderIndices));
8803 if (TE->hasState() && TE->isAltShuffle() &&
8804 TE->State != TreeEntry::SplitVectorize) {
8805 Type *ScalarTy = TE->Scalars[0]->getType();
8807 unsigned Opcode0 = TE->getOpcode();
8808 unsigned Opcode1 = TE->getAltOpcode();
8812 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8813 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8819 if (std::optional<OrdersType> CurrentOrder =
8829 const TreeEntry *UserTE = TE.get();
8831 if (!UserTE->UserTreeIndex)
8833 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8834 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8835 UserTE->UserTreeIndex.UserTE->Idx != 0)
8837 UserTE = UserTE->UserTreeIndex.UserTE;
8840 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
8841 if (!(TE->State == TreeEntry::Vectorize ||
8842 TE->State == TreeEntry::StridedVectorize ||
8843 TE->State == TreeEntry::SplitVectorize ||
8844 TE->State == TreeEntry::CompressVectorize) ||
8845 !TE->ReuseShuffleIndices.empty())
8846 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
8847 if (TE->State == TreeEntry::Vectorize &&
8848 TE->getOpcode() == Instruction::PHI)
8849 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
8854 for (
unsigned VF = VectorizableTree.front()->getVectorFactor();
8855 !VFToOrderedEntries.
empty() && VF > 1; --VF) {
8856 auto It = VFToOrderedEntries.
find(VF);
8857 if (It == VFToOrderedEntries.
end())
8871 for (
const TreeEntry *OpTE : OrderedEntries) {
8874 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE) &&
8875 OpTE->State != TreeEntry::SplitVectorize)
8878 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8880 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8881 auto It = GathersToOrders.find(OpTE);
8882 if (It != GathersToOrders.end())
8885 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8886 auto It = AltShufflesToOrders.find(OpTE);
8887 if (It != AltShufflesToOrders.end())
8890 if (OpTE->State == TreeEntry::Vectorize &&
8891 OpTE->getOpcode() == Instruction::PHI) {
8892 auto It = PhisToOrders.
find(OpTE);
8893 if (It != PhisToOrders.
end())
8896 return OpTE->ReorderIndices;
8899 auto It = ExternalUserReorderMap.
find(OpTE);
8900 if (It != ExternalUserReorderMap.
end()) {
8901 const auto &ExternalUserReorderIndices = It->second;
8905 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8906 OrdersUses.try_emplace(
OrdersType(), 0).first->second +=
8907 ExternalUserReorderIndices.size();
8909 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
8910 ++OrdersUses.try_emplace(ExtOrder, 0).first->second;
8917 if (OpTE->State == TreeEntry::Vectorize &&
8918 OpTE->getOpcode() == Instruction::Store && !Order.
empty()) {
8919 assert(!OpTE->isAltShuffle() &&
8920 "Alternate instructions are only supported by BinaryOperator "
8924 unsigned E = Order.
size();
8927 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8930 ++OrdersUses.try_emplace(CurrentOrder, 0).first->second;
8932 ++OrdersUses.try_emplace(Order, 0).first->second;
8935 if (OrdersUses.empty())
8938 unsigned IdentityCnt = 0;
8939 unsigned FilledIdentityCnt = 0;
8941 for (
auto &Pair : OrdersUses) {
8943 if (!Pair.first.empty())
8944 FilledIdentityCnt += Pair.second;
8945 IdentityCnt += Pair.second;
8950 unsigned Cnt = IdentityCnt;
8951 for (
auto &Pair : OrdersUses) {
8955 if (Cnt < Pair.second ||
8956 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8957 Cnt == Pair.second && !BestOrder.
empty() &&
8960 BestOrder = Pair.first;
8973 unsigned E = BestOrder.
size();
8975 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8978 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8980 if (TE->Scalars.size() != VF) {
8981 if (TE->ReuseShuffleIndices.size() == VF) {
8982 assert(TE->State != TreeEntry::SplitVectorize &&
8983 "Split vectorized not expected.");
8988 (!TE->UserTreeIndex ||
8989 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8990 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8991 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8992 "All users must be of VF size.");
8999 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
9005 reorderNodeWithReuses(*TE, Mask);
9007 if (TE->UserTreeIndex &&
9008 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
9009 TE->UserTreeIndex.UserTE->reorderSplitNode(
9010 TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
9014 if ((TE->State == TreeEntry::SplitVectorize &&
9015 TE->ReuseShuffleIndices.empty()) ||
9016 ((TE->State == TreeEntry::Vectorize ||
9017 TE->State == TreeEntry::StridedVectorize ||
9018 TE->State == TreeEntry::CompressVectorize) &&
9023 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
9024 TE->ReuseShuffleIndices.empty())) &&
9025 "Alternate instructions are only supported by BinaryOperator "
9031 TE->reorderOperands(Mask);
9034 TE->reorderOperands(Mask);
9035 assert(TE->ReorderIndices.empty() &&
9036 "Expected empty reorder sequence.");
9039 if (!TE->ReuseShuffleIndices.empty()) {
9046 addMask(NewReuses, TE->ReuseShuffleIndices);
9047 TE->ReuseShuffleIndices.swap(NewReuses);
9048 }
else if (TE->UserTreeIndex &&
9049 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
9051 TE->UserTreeIndex.UserTE->reorderSplitNode(TE->UserTreeIndex.EdgeIdx,
/// Examines operands of \p UserTE and appends (operand index, operand tree
/// entry) pairs to \p Edges. The operand index is `I`; several
/// (opcode, operand index) combinations of the user node are tested
/// explicitly below.
9057void BoUpSLP::buildReorderableOperands(
9058 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
// Tests whether Edges already holds, for operand I, an entry in one of the
// Vectorize / StridedVectorize / CompressVectorize / SplitVectorize states.
9062 if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
9063 return OpData.first ==
I &&
9064 (OpData.second->State == TreeEntry::Vectorize ||
9065 OpData.second->State == TreeEntry::StridedVectorize ||
9066 OpData.second->State == TreeEntry::CompressVectorize ||
9067 OpData.second->State == TreeEntry::SplitVectorize);
// Opcode-based checks only apply when the user node has a state.
9071 if (UserTE->hasState()) {
// Case: the user is an extractelement/extractvalue node.
9072 if (UserTE->getOpcode() == Instruction::ExtractElement ||
9073 UserTE->getOpcode() == Instruction::ExtractValue)
// Case: operand 0 of an insertelement node.
9075 if (UserTE->getOpcode() == Instruction::InsertElement &&
I == 0)
// Case: operand 1 of a store node in Vectorize or StridedVectorize state.
9077 if (UserTE->getOpcode() == Instruction::Store &&
I == 1 &&
9078 (UserTE->State == TreeEntry::Vectorize ||
9079 UserTE->State == TreeEntry::StridedVectorize))
// Case: a load node in Vectorize, StridedVectorize or CompressVectorize state.
9081 if (UserTE->getOpcode() == Instruction::Load &&
9082 (UserTE->State == TreeEntry::Vectorize ||
9083 UserTE->State == TreeEntry::StridedVectorize ||
9084 UserTE->State == TreeEntry::CompressVectorize))
// Look up the tree entry that feeds operand I; it is asserted to exist.
9087 TreeEntry *
TE = getOperandEntry(UserTE,
I);
9088 assert(TE &&
"Expected operand entry.");
// A non-gather operand entry is recorded as an (index, entry) edge.
9089 if (!
TE->isGather()) {
9092 Edges.emplace_back(
I, TE);
// Tests for a ScatterVectorize entry that has neither reuse-shuffle indices
// nor reorder indices.
9098 if (
TE->State == TreeEntry::ScatterVectorize &&
9099 TE->ReuseShuffleIndices.empty() &&
TE->ReorderIndices.empty())
// Tests whether the operand entry is a member of ReorderableGathers.
9103 if (ReorderableGathers.
contains(TE))
// Comparator over tree entries. When both entries have a user edge
// (UserTreeIndex), they are ordered by the Idx of their user nodes;
// otherwise they are ordered by their own Idx.
9109 struct TreeEntryCompare {
9110 bool operator()(
const TreeEntry *LHS,
const TreeEntry *RHS)
const {
// Both entries have a user: compare the indices of the user nodes.
9111 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
9112 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
// At least one entry has no user: fall back to the entries' own indices.
9113 return LHS->Idx < RHS->Idx;
9122 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9123 if (TE->State != TreeEntry::Vectorize &&
9124 TE->State != TreeEntry::StridedVectorize &&
9125 TE->State != TreeEntry::CompressVectorize &&
9126 TE->State != TreeEntry::SplitVectorize)
9127 NonVectorized.
insert(TE.get());
9128 if (std::optional<OrdersType> CurrentOrder =
9130 Queue.push(TE.get());
9131 if (!(TE->State == TreeEntry::Vectorize ||
9132 TE->State == TreeEntry::StridedVectorize ||
9133 TE->State == TreeEntry::CompressVectorize ||
9134 TE->State == TreeEntry::SplitVectorize) ||
9135 !TE->ReuseShuffleIndices.empty())
9136 GathersToOrders.
insert(TE.get());
9145 while (!Queue.empty()) {
9147 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>
Users;
9148 TreeEntry *TE = Queue.top();
9149 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
9152 while (!Queue.empty()) {
9154 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
9159 for (TreeEntry *TE : OrderedOps) {
9160 if (!(TE->State == TreeEntry::Vectorize ||
9161 TE->State == TreeEntry::StridedVectorize ||
9162 TE->State == TreeEntry::CompressVectorize ||
9163 TE->State == TreeEntry::SplitVectorize ||
9164 (TE->isGather() && GathersToOrders.
contains(TE))) ||
9165 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
9166 !Visited.
insert(TE).second)
9170 Users.first = TE->UserTreeIndex.UserTE;
9171 Users.second.emplace_back(TE->UserTreeIndex.EdgeIdx, TE);
9175 if (
Data.first->State == TreeEntry::SplitVectorize) {
9177 Data.second.size() <= 2 &&
9178 "Expected not greater than 2 operands for split vectorize node.");
9180 [](
const auto &
Op) { return !Op.second->UserTreeIndex; }))
9183 assert(
Data.first->CombinedEntriesWithIndices.size() == 2 &&
9184 "Expected exactly 2 entries.");
9185 for (
const auto &
P :
Data.first->CombinedEntriesWithIndices) {
9186 TreeEntry &OpTE = *VectorizableTree[
P.first];
9188 if (Order.
empty() || !OpTE.ReuseShuffleIndices.empty()) {
9189 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
9191 const auto BestOrder =
9200 const unsigned E = Order.
size();
9203 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9205 Data.first->reorderSplitNode(
P.second ? 1 : 0, Mask, MaskOrder);
9207 if (!OpTE.ReorderIndices.empty()) {
9208 OpTE.ReorderIndices.clear();
9209 }
else if (!OpTE.ReuseShuffleIndices.empty()) {
9212 assert(OpTE.isGather() &&
"Expected only gather/buildvector node.");
9216 if (
Data.first->ReuseShuffleIndices.empty() &&
9217 !
Data.first->ReorderIndices.empty()) {
9220 Queue.push(
Data.first);
9226 buildReorderableOperands(
Data.first,
Data.second, NonVectorized,
9238 for (
const auto &
Op :
Data.second) {
9239 TreeEntry *OpTE =
Op.second;
9240 if (!VisitedOps.
insert(OpTE).second)
9242 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
9244 const auto Order = [&]() ->
const OrdersType {
9245 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
9249 return OpTE->ReorderIndices;
9253 if (Order.
size() == 1)
9259 Value *Root = OpTE->hasState()
9262 auto GetSameNodesUsers = [&](
Value *Root) {
9264 for (
const TreeEntry *TE : ValueToGatherNodes.lookup(Root)) {
9265 if (TE != OpTE && TE->UserTreeIndex &&
9266 TE->getVectorFactor() == OpTE->getVectorFactor() &&
9267 TE->Scalars.size() == OpTE->Scalars.size() &&
9268 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
9269 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
9270 Res.
insert(TE->UserTreeIndex.UserTE);
9272 for (
const TreeEntry *TE : getTreeEntries(Root)) {
9273 if (TE != OpTE && TE->UserTreeIndex &&
9274 TE->getVectorFactor() == OpTE->getVectorFactor() &&
9275 TE->Scalars.size() == OpTE->Scalars.size() &&
9276 ((TE->ReorderIndices.empty() && OpTE->isSame(TE->Scalars)) ||
9277 (OpTE->ReorderIndices.empty() && TE->isSame(OpTE->Scalars))))
9278 Res.
insert(TE->UserTreeIndex.UserTE);
9282 auto GetNumOperands = [](
const TreeEntry *TE) {
9283 if (TE->State == TreeEntry::SplitVectorize)
9284 return TE->getNumOperands();
9286 return CI->arg_size();
9287 return TE->getNumOperands();
9289 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
9290 const TreeEntry *TE) {
9298 const TreeEntry *
Op = getOperandEntry(TE, Idx);
9299 if (
Op->isGather() &&
Op->hasState()) {
9300 const TreeEntry *VecOp =
9301 getSameValuesTreeEntry(
Op->getMainOp(),
Op->Scalars);
9305 if (
Op->ReorderIndices.empty() &&
Op->ReuseShuffleIndices.empty())
9312 if (!RevisitedOps.
insert(UTE).second)
9314 return UTE ==
Data.first || !UTE->ReorderIndices.empty() ||
9315 !UTE->ReuseShuffleIndices.empty() ||
9316 (UTE->UserTreeIndex &&
9317 UTE->UserTreeIndex.UserTE ==
Data.first) ||
9318 (
Data.first->UserTreeIndex &&
9319 Data.first->UserTreeIndex.UserTE == UTE) ||
9320 (IgnoreReorder && UTE->UserTreeIndex &&
9321 UTE->UserTreeIndex.UserTE->Idx == 0) ||
9322 NodeShouldBeReorderedWithOperands(UTE);
9325 for (TreeEntry *UTE :
Users) {
9333 const TreeEntry *
Op = getOperandEntry(UTE, Idx);
9335 Queue.push(
const_cast<TreeEntry *
>(
Op));
9340 Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
9341 return P.second == OpTE;
9344 if (OpTE->State == TreeEntry::Vectorize &&
9345 OpTE->getOpcode() == Instruction::Store && !Order.
empty()) {
9346 assert(!OpTE->isAltShuffle() &&
9347 "Alternate instructions are only supported by BinaryOperator "
9351 unsigned E = Order.
size();
9354 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
9357 OrdersUses.try_emplace(CurrentOrder, 0).first->second +=
NumOps;
9359 OrdersUses.try_emplace(Order, 0).first->second +=
NumOps;
9361 auto Res = OrdersUses.try_emplace(
OrdersType(), 0);
9362 const auto AllowsReordering = [&](
const TreeEntry *TE) {
9363 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
9364 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
9365 (IgnoreReorder && TE->Idx == 0))
9367 if (TE->isGather()) {
9377 if (OpTE->UserTreeIndex) {
9378 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
9379 if (!VisitedUsers.
insert(UserTE).second)
9384 if (AllowsReordering(UserTE))
9392 if (
static_cast<unsigned>(
count_if(
9393 Ops, [UserTE, &AllowsReordering](
9394 const std::pair<unsigned, TreeEntry *> &
Op) {
9395 return AllowsReordering(
Op.second) &&
9396 Op.second->UserTreeIndex.UserTE == UserTE;
9397 })) <=
Ops.size() / 2)
9398 ++Res.first->second;
9401 if (OrdersUses.empty()) {
9406 unsigned IdentityCnt = 0;
9407 unsigned VF =
Data.second.front().second->getVectorFactor();
9409 for (
auto &Pair : OrdersUses) {
9411 IdentityCnt += Pair.second;
9416 unsigned Cnt = IdentityCnt;
9417 for (
auto &Pair : OrdersUses) {
9421 if (Cnt < Pair.second) {
9423 BestOrder = Pair.first;
9440 unsigned E = BestOrder.
size();
9442 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9444 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
9445 TreeEntry *TE =
Op.second;
9446 if (!VisitedOps.
insert(TE).second)
9448 if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
9449 reorderNodeWithReuses(*TE, Mask);
9453 if (TE->State != TreeEntry::Vectorize &&
9454 TE->State != TreeEntry::StridedVectorize &&
9455 TE->State != TreeEntry::CompressVectorize &&
9456 TE->State != TreeEntry::SplitVectorize &&
9457 (TE->State != TreeEntry::ScatterVectorize ||
9458 TE->ReorderIndices.empty()))
9460 assert((BestOrder.
size() == TE->ReorderIndices.size() ||
9461 TE->ReorderIndices.empty()) &&
9462 "Non-matching sizes of user/operand entries.");
9464 if (IgnoreReorder && TE == VectorizableTree.front().get())
9465 IgnoreReorder =
false;
9468 for (TreeEntry *
Gather : GatherOps) {
9470 "Unexpected reordering of gathers.");
9471 if (!
Gather->ReuseShuffleIndices.empty()) {
9481 auto IsNotProfitableAltCodeNode = [](
const TreeEntry &TE) {
9482 return TE.isAltShuffle() &&
9483 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
9484 TE.ReorderIndices.empty());
9486 if (
Data.first->State != TreeEntry::Vectorize ||
9488 Data.first->getMainOp()) ||
9489 IsNotProfitableAltCodeNode(*
Data.first))
9490 Data.first->reorderOperands(Mask);
9492 IsNotProfitableAltCodeNode(*
Data.first) ||
9493 Data.first->State == TreeEntry::CompressVectorize) {
9497 if (
Data.first->ReuseShuffleIndices.empty() &&
9498 !
Data.first->ReorderIndices.empty() &&
9499 !IsNotProfitableAltCodeNode(*
Data.first)) {
9502 Queue.push(
Data.first);
9510 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
9511 VectorizableTree.front()->ReuseShuffleIndices.empty())
9512 VectorizableTree.front()->ReorderIndices.
clear();
/// Returns the instruction regarded as the root of \p Entry.
/// NOTE(review): only the condition below is visible here; presumably the
/// reversed strided load/store case selects a different scalar than the
/// default one - confirm against the return statements.
9515Instruction *BoUpSLP::getRootEntryInstruction(
const TreeEntry &Entry)
const {
// Matches a load or store entry in StridedVectorize state whose non-empty
// reorder indices satisfy isReverseOrder().
9516 if (Entry.hasState() &&
9517 (Entry.getOpcode() == Instruction::Store ||
9518 Entry.getOpcode() == Instruction::Load) &&
9519 Entry.State == TreeEntry::StridedVectorize &&
9520 !Entry.ReorderIndices.empty() &&
isReverseOrder(Entry.ReorderIndices))
9527 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
9530 for (
auto &TEPtr : VectorizableTree) {
9531 TreeEntry *Entry = TEPtr.get();
9534 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
9535 DeletedNodes.contains(Entry) ||
9536 TransformedToGatherNodes.contains(Entry))
9540 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
9541 Value *Scalar = Entry->Scalars[Lane];
9546 auto It = ScalarToExtUses.
find(Scalar);
9547 if (It != ScalarToExtUses.
end() && !ExternalUses[It->second].User)
9550 if (Scalar->hasNUsesOrMore(NumVectScalars)) {
9551 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9552 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract from lane " << FoundLane
9553 <<
" from " << *Scalar <<
"for many users.\n");
9554 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
9555 ExternalUses.emplace_back(Scalar,
nullptr, *Entry, FoundLane);
9556 ExternalUsesWithNonUsers.insert(Scalar);
9561 const auto ExtI = ExternallyUsedValues.
find(Scalar);
9562 if (ExtI != ExternallyUsedValues.
end()) {
9563 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9564 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract: Extra arg from lane "
9565 << FoundLane <<
" from " << *Scalar <<
".\n");
9566 ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size());
9567 ExternalUses.emplace_back(Scalar,
nullptr, *Entry, FoundLane);
9578 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
9583 any_of(UseEntries, [
this](
const TreeEntry *UseEntry) {
9584 return !DeletedNodes.contains(UseEntry) &&
9585 !TransformedToGatherNodes.contains(UseEntry);
9590 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
9593 all_of(UseEntries, [&](TreeEntry *UseEntry) {
9594 if (DeletedNodes.contains(UseEntry) ||
9595 TransformedToGatherNodes.contains(UseEntry))
9597 return UseEntry->State == TreeEntry::ScatterVectorize ||
9599 Scalar, getRootEntryInstruction(*UseEntry), TLI,
9602 LLVM_DEBUG(
dbgs() <<
"SLP: \tInternal user will be removed:" << *U
9605 [](TreeEntry *UseEntry) {
9606 return UseEntry->isGather();
9612 if (It != ScalarToExtUses.
end()) {
9613 ExternalUses[It->second].User =
nullptr;
9618 if (U && Scalar->hasNUsesOrMore(
UsesLimit))
9620 unsigned FoundLane = Entry->findLaneForValue(Scalar);
9622 <<
" from lane " << FoundLane <<
" from " << *Scalar
9624 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
9625 ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
9626 ExternalUsesWithNonUsers.insert(Scalar);
/// Walks the scalars of \p TE lane by lane and groups candidate store
/// instructions into PtrToStoresMap buckets.
/// NOTE(review): presumably the candidate stores are users of the scalars
/// (per the function name); the user walk is not visible here - confirm.
9635BoUpSLP::collectUserStores(
const BoUpSLP::TreeEntry *TE)
const {
// Visit every lane of the entry and take the scalar in that lane.
9639 for (
unsigned Lane :
seq<unsigned>(0, TE->Scalars.size())) {
9640 Value *V = TE->Scalars[Lane];
// Filter on the candidate store: the tested conditions include a null
// pointer, a non-simple store, and a store in a function other than F.
9653 if (
SI ==
nullptr || !
SI->isSimple() ||
SI->getFunction() !=
F ||
// Stores are bucketed by (parent block, stored value type, Ptr).
9662 auto &StoresVec = PtrToStoresMap[{
SI->getParent(),
9663 SI->getValueOperand()->getType(), Ptr}];
// Compares the number of stores already in the bucket with the lane index.
9666 if (StoresVec.size() > Lane)
// With a non-empty bucket, the candidate's value type and pointer operand are
// passed together with those of the bucket's first store (plus DL and SE).
9668 if (!StoresVec.empty()) {
9670 SI->getValueOperand()->getType(),
SI->getPointerOperand(),
9671 SI->getValueOperand()->getType(),
9672 StoresVec.front()->getPointerOperand(), *
DL, *SE,
// Append the store to its bucket.
9678 StoresVec.push_back(SI);
// Iterate over all collected buckets.
9683 for (
auto &
P : PtrToStoresMap) {
9698 StoreInst *S0 = StoresVec[0];
9703 StoreInst *
SI = StoresVec[Idx];
9704 std::optional<int64_t> Diff =
9706 SI->getPointerOperand(), *DL, *SE,
9712 if (StoreOffsetVec.
size() != StoresVec.
size())
9714 sort(StoreOffsetVec, llvm::less_first());
9716 int64_t PrevDist = 0;
9717 for (
const auto &
P : StoreOffsetVec) {
9718 if (Idx > 0 &&
P.first != PrevDist + 1)
9726 ReorderIndices.assign(StoresVec.
size(), 0);
9727 bool IsIdentity =
true;
9729 ReorderIndices[
P.second] =
I;
9730 IsIdentity &=
P.second ==
I;
9736 ReorderIndices.clear();
9743 for (
unsigned Idx : Order)
9744 dbgs() << Idx <<
", ";
/// Builds and returns ExternalReorderIndices, a list of reorder-index
/// sequences obtained through canFormVector() for groups of stores related
/// to \p TE.
9750BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE)
const {
// Number of lanes equals the number of scalars in the entry.
9751 unsigned NumLanes =
TE->Scalars.size();
// Compares the size of a store group against the lane count.
9764 if (StoresVec.
size() != NumLanes)
// Tests whether canFormVector() rejects the group; ReorderIndices is passed
// to it alongside the stores.
9769 if (!canFormVector(StoresVec, ReorderIndices))
// Record the reorder indices for this group.
9774 ExternalReorderIndices.
push_back(ReorderIndices);
9776 return ExternalReorderIndices;
9782 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9783 "TreeEntryToStridedPtrInfoMap is not cleared");
9784 UserIgnoreList = &UserIgnoreLst;
9787 buildTreeRec(Roots, 0,
EdgeInfo());
9792 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9793 "TreeEntryToStridedPtrInfoMap is not cleared");
9796 buildTreeRec(Roots, 0,
EdgeInfo());
9805 bool AddNew =
true) {
9813 for (
Value *V : VL) {
9817 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
9819 bool IsFound =
false;
9820 for (
auto [Map,
Data] :
zip(ClusteredDistToLoad, ClusteredLoads)) {
9821 assert(LI->getParent() ==
Data.front().first->getParent() &&
9822 LI->getType() ==
Data.front().first->getType() &&
9826 "Expected loads with the same type, same parent and same "
9827 "underlying pointer.");
9829 LI->getType(), LI->getPointerOperand(),
Data.front().first->getType(),
9830 Data.front().first->getPointerOperand(),
DL, SE,
9834 auto It = Map.find(*Dist);
9835 if (It != Map.end() && It->second != LI)
9837 if (It == Map.end()) {
9838 Data.emplace_back(LI, *Dist);
9839 Map.try_emplace(*Dist, LI);
9849 auto FindMatchingLoads =
9854 int64_t &
Offset,
unsigned &Start) {
9856 return GatheredLoads.
end();
9865 std::optional<int64_t> Dist =
9867 Data.front().first->getType(),
9868 Data.front().first->getPointerOperand(),
DL, SE,
9874 for (std::pair<LoadInst *, int64_t>
P :
Data) {
9880 unsigned NumUniques = 0;
9881 for (
auto [Cnt, Pair] :
enumerate(Loads)) {
9882 bool Used = DataLoads.
contains(Pair.first);
9883 if (!Used && !DataDists.
contains(*Dist + Pair.second)) {
9890 if (NumUniques > 0 &&
9891 (Loads.
size() == NumUniques ||
9892 (Loads.
size() - NumUniques >= 2 &&
9893 Loads.
size() - NumUniques >= Loads.
size() / 2 &&
9899 return std::next(GatheredLoads.
begin(), Idx);
9903 return GatheredLoads.
end();
9905 for (
ArrayRef<std::pair<LoadInst *, int64_t>>
Data : ClusteredLoads) {
9909 auto *It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd,
Repeated,
9911 while (It != GatheredLoads.
end()) {
9912 assert(!LocalToAdd.
empty() &&
"Expected some elements to add.");
9913 for (
unsigned Idx : LocalToAdd)
9920 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
9927 Loads.push_back(
Data[Idx]);
9933 GatheredLoads, [&](
ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9934 return PD.front().first->getParent() == LI->
getParent() &&
9935 PD.front().first->getType() == LI->
getType();
9937 while (It != GatheredLoads.
end()) {
9940 std::next(It), GatheredLoads.
end(),
9941 [&](
ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9942 return PD.front().first->getParent() == LI->getParent() &&
9943 PD.front().first->getType() == LI->getType();
9947 GatheredLoads.emplace_back().append(
Data.begin(),
Data.end());
9948 AddNewLoads(GatheredLoads.emplace_back());
/// Tries to vectorize loads that were gathered, by slicing them into
/// candidate groups and calling buildTreeRec() on the slices (see the
/// "SLP: Trying to vectorize gathered loads" debug message below).
9953void BoUpSLP::tryToVectorizeGatheredLoads(
9954 const SmallMapVector<
9955 std::tuple<BasicBlock *, Value *, Type *>,
// Record the current number of tree entries in GatheredLoadsEntriesFirst.
9958 GatheredLoadsEntriesFirst = VectorizableTree.
size();
9961 LoadEntriesToVectorize.size());
// Fill each set with the scalars of the corresponding load entry.
9962 for (
auto [Idx, Set] :
zip(LoadEntriesToVectorize, LoadSetsToVectorize))
9963 Set.insert_range(VectorizableTree[Idx]->Scalars);
// Orders (load, distance) pairs by descending distance.
9966 auto LoadSorter = [](
const std::pair<LoadInst *, int64_t> &L1,
9967 const std::pair<LoadInst *, int64_t> &L2) {
9968 return L1.second > L2.second;
// A masked gather of the widened type must be legal for the target and must
// not be force-scalarized.
9975 auto *Ty =
getWidenedType(Loads.front()->getType(), Loads.size());
9976 return TTI->isLegalMaskedGather(Ty, Alignment) &&
9977 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
9982 SmallVectorImpl<LoadInst *> &NonVectorized,
9983 bool Final,
unsigned MaxVF) {
9985 unsigned StartIdx = 0;
9986 SmallVector<int> CandidateVFs;
9990 *TTI, Loads.
front()->getType(), MaxVF);
9992 *TTI, Loads.
front()->getType(), NumElts - 1)) {
9998 if (Final && CandidateVFs.
empty())
// In the final attempt BestVF starts at the last candidate VF, otherwise 0.
10001 unsigned BestVF = Final ? CandidateVFs.
back() : 0;
// Try each candidate vector factor.
10002 for (
unsigned NumElts : CandidateVFs) {
10003 if (Final && NumElts > BestVF)
10005 SmallVector<unsigned> MaskedGatherVectorized;
10006 for (
unsigned Cnt = StartIdx,
E = Loads.
size(); Cnt <
E;
10010 if (VectorizedLoads.count(Slice.
front()) ||
10011 VectorizedLoads.count(Slice.
back()) ||
10017 bool AllowToVectorize =
false;
// Two-element slices get extra per-load checks.
10019 if (NumElts == 2) {
10020 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
10023 for (LoadInst *LI : Slice) {
10025 if (LI->hasOneUse())
10031 if (
static_cast<unsigned int>(std::distance(
10032 LI->user_begin(), LI->user_end())) != LI->getNumUses())
10034 if (!IsLegalBroadcastLoad)
10038 for (User *U : LI->users()) {
10041 for (
const TreeEntry *UTE : getTreeEntries(U)) {
10042 for (
int I :
seq<int>(UTE->getNumOperands())) {
10044 return V == LI || isa<PoisonValue>(V);
10054 AllowToVectorize = CheckIfAllowed(Slice);
10058 any_of(ValueToGatherNodes.at(Slice.front()),
10059 [=](
const TreeEntry *TE) {
10060 return TE->Scalars.size() == 2 &&
10061 ((TE->Scalars.front() == Slice.front() &&
10062 TE->Scalars.back() == Slice.back()) ||
10063 (TE->Scalars.front() == Slice.back() &&
10064 TE->Scalars.back() == Slice.front()));
10067 if (AllowToVectorize) {
10072 reinterpret_cast<Value *
const *
>(Slice.begin()), Slice.size());
10075 PointerOps, SPtrInfo, &BestVF);
10077 (BestVF > 1 &&
static_cast<unsigned>(NumElts) == 2 * BestVF)) {
10079 if (MaskedGatherVectorized.
empty() ||
10080 Cnt >= MaskedGatherVectorized.
back() + NumElts)
// Record the result and mark the slice's loads as vectorized.
10085 Results.emplace_back(Values, LS);
10086 VectorizedLoads.insert_range(Slice);
// When the slice begins at StartIdx, advance StartIdx past it.
10089 if (Cnt == StartIdx)
10090 StartIdx += NumElts;
10093 if (StartIdx >= Loads.
size())
10097 if (!MaskedGatherVectorized.
empty() &&
10098 Cnt < MaskedGatherVectorized.
back() + NumElts)
10099 MaskedGatherVectorized.
pop_back();
10100 Cnt += NumElts - 1;
10104 if (!AllowToVectorize || BestVF == 0)
// Process the start positions collected in MaskedGatherVectorized.
10108 for (
unsigned Cnt : MaskedGatherVectorized) {
10110 Cnt, std::min<unsigned>(NumElts, Loads.
size() - Cnt));
10114 VectorizedLoads.insert_range(Slice);
10116 if (Cnt == StartIdx)
10117 StartIdx += NumElts;
// Loads that are not in VectorizedLoads are appended to NonVectorized.
10120 for (LoadInst *LI : Loads) {
10121 if (!VectorizedLoads.contains(LI))
10122 NonVectorized.push_back(LI);
10126 auto ProcessGatheredLoads =
10129 bool Final =
false) {
10131 for (
ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
// A group with at most one load is passed straight to NonVectorized.
10133 if (LoadsDists.size() <= 1) {
10134 NonVectorized.
push_back(LoadsDists.back().first);
// Track the longest run of consecutive distances seen so far.
10142 unsigned MaxConsecutiveDistance = 0;
10143 unsigned CurrentConsecutiveDist = 1;
10144 int64_t LastDist = LocalLoadsDists.front().second;
10145 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
10146 for (
const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
10149 assert(LastDist >=
L.second &&
10150 "Expected first distance always not less than second");
10151 if (
static_cast<uint64_t
>(LastDist -
L.second) ==
10152 CurrentConsecutiveDist) {
10153 ++CurrentConsecutiveDist;
10154 MaxConsecutiveDistance =
10155 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
10159 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
10162 CurrentConsecutiveDist = 1;
10163 LastDist =
L.second;
10166 if (Loads.
size() <= 1)
// With masked gathers allowed, the full number of loads is used as the limit.
10168 if (AllowMaskedGather)
10169 MaxConsecutiveDistance = Loads.
size();
10170 else if (MaxConsecutiveDistance < 2)
10175 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
10176 Final, MaxConsecutiveDistance);
10178 OriginalLoads.size() == Loads.
size() &&
10179 MaxConsecutiveDistance == Loads.
size() &&
10184 VectorizedLoads.
clear();
// Second attempt on the loads in their original order.
10188 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
10189 UnsortedNonVectorized, Final,
10190 OriginalLoads.size());
// If the sorted attempt left at least as many loads non-vectorized as the
// unsorted one, swap in the unsorted attempt's results.
10191 if (SortedNonVectorized.
size() >= UnsortedNonVectorized.
size()) {
10192 SortedNonVectorized.
swap(UnsortedNonVectorized);
10193 Results.swap(UnsortedResults);
10197 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize gathered loads ("
10198 << Slice.
size() <<
")\n");
10200 for (
Value *L : Slice)
10208 unsigned MaxVF = Slice.size();
10209 unsigned UserMaxVF = 0;
10210 unsigned InterleaveFactor = 0;
10215 std::optional<unsigned> InterleavedLoadsDistance = 0;
10216 unsigned Order = 0;
10217 std::optional<unsigned> CommonVF = 0;
10218 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
10219 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
// Inspect, for every value of the slice, the gather nodes that contain it.
10220 for (
auto [Idx, V] :
enumerate(Slice)) {
10221 for (
const TreeEntry *
E : ValueToGatherNodes.at(V)) {
10222 UserMaxVF = std::max<unsigned>(UserMaxVF,
E->Scalars.size());
10225 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
10227 if (*CommonVF == 0) {
10228 CommonVF =
E->Scalars.size();
10231 if (*CommonVF !=
E->Scalars.size())
10235 if (Pos != Idx && InterleavedLoadsDistance) {
10238 if (isa<Constant>(V))
10240 if (isVectorized(V))
10242 const auto &Nodes = ValueToGatherNodes.at(V);
10243 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
10244 !is_contained(Slice, V);
10246 InterleavedLoadsDistance.reset();
10249 DeinterleavedNodes.
insert(
E);
10250 if (*InterleavedLoadsDistance == 0) {
10251 InterleavedLoadsDistance = Idx - Pos;
10254 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
10255 (Idx - Pos) / *InterleavedLoadsDistance < Order)
10256 InterleavedLoadsDistance.reset();
10257 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
10261 DeinterleavedNodes.
clear();
// An interleave factor is only derived when a distance > 1 and a non-zero
// common VF were found.
10263 if (InterleavedLoadsDistance.value_or(0) > 1 &&
10264 CommonVF.value_or(0) != 0) {
10265 InterleaveFactor =
bit_ceil(*InterleavedLoadsDistance);
10266 unsigned VF = *CommonVF;
10271 if (InterleaveFactor <= Slice.size() &&
10272 TTI.isLegalInterleavedAccessType(
10280 UserMaxVF = InterleaveFactor * VF;
10282 InterleaveFactor = 0;
10287 unsigned ConsecutiveNodesSize = 0;
10288 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
10289 any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
10290 [&, Slice = Slice](
const auto &
P) {
10292 return std::get<1>(
P).contains(V);
10294 if (It == Slice.end())
10296 const TreeEntry &
TE =
10297 *VectorizableTree[std::get<0>(
P)];
10303 VL, VL.
front(), Order, PointerOps, SPtrInfo);
10307 ConsecutiveNodesSize += VL.
size();
10308 size_t Start = std::distance(Slice.begin(), It);
10309 size_t Sz = Slice.size() -
Start;
10310 return Sz < VL.
size() ||
10311 Slice.slice(Start, VL.
size()) != VL;
10316 if (InterleaveFactor == 0 &&
10318 [&, Slice = Slice](
unsigned Idx) {
10320 SmallVector<Value *> PointerOps;
10321 StridedPtrInfo SPtrInfo;
10322 return canVectorizeLoads(
10323 Slice.slice(Idx * UserMaxVF, UserMaxVF),
10324 Slice[Idx * UserMaxVF], Order, PointerOps,
10325 SPtrInfo) == LoadsState::ScatterVectorize;
// Cap MaxVF by UserMaxVF unless the consecutive nodes cover the whole slice.
10328 if (Slice.size() != ConsecutiveNodesSize)
10329 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
// Try vector factors from MaxVF downwards, halving each time, down to 2.
10331 for (
unsigned VF = MaxVF; VF >= 2; VF /= 2) {
10332 bool IsVectorized =
true;
10333 for (
unsigned I = 0,
E = Slice.size();
I <
E;
I += VF) {
10335 Slice.slice(
I, std::min(VF,
E -
I));
10340 if (
any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
10341 [&](
const auto &
P) {
10342 return !SubSlice.
equals(
10343 VectorizableTree[std::get<0>(
P)]
// Snapshot the tree size; if buildTreeRec() adds no entry, the sub-slice is
// treated as not vectorized.
10348 unsigned Sz = VectorizableTree.size();
10349 buildTreeRec(SubSlice, 0,
EdgeInfo(), InterleaveFactor);
10350 if (Sz == VectorizableTree.size()) {
10351 IsVectorized =
false;
10354 if (InterleaveFactor > 0) {
10355 VF = 2 * (MaxVF / InterleaveFactor);
10356 InterleaveFactor = 0;
10365 NonVectorized.
append(SortedNonVectorized);
10367 return NonVectorized;
// Process each group of gathered loads.
10369 for (
const auto &GLs : GatheredLoads) {
10370 const auto &
Ref = GLs.second;
10372 if (!
Ref.empty() && !NonVectorized.
empty() &&
10375 [](
unsigned S,
ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
10376 ->
unsigned {
return S + LoadsDists.size(); }) !=
10377 NonVectorized.
size() &&
10378 IsMaskedGatherSupported(NonVectorized)) {
10380 FinalGatheredLoads;
10381 for (LoadInst *LI : NonVectorized) {
10385 FinalGatheredLoads,
// Final pass over the regrouped leftovers; the returned list is discarded.
10389 (void)ProcessGatheredLoads(FinalGatheredLoads,
true);
// Revisit the load entries that were queued in LoadEntriesToVectorize.
10393 for (
unsigned Idx : LoadEntriesToVectorize) {
10394 const TreeEntry &
E = *VectorizableTree[Idx];
10397 if (!
E.ReorderIndices.empty()) {
10400 SmallVector<int> ReorderMask;
10404 buildTreeRec(GatheredScalars, 0,
EdgeInfo());
// If the tree has the same size as when GatheredLoadsEntriesFirst was
// recorded, reset the marker.
10408 if (
static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
10409 VectorizableTree.size())
10410 GatheredLoadsEntriesFirst.reset();
10420 bool AllowAlternate) {
10426 if (LI->isSimple())
10437 SubKey =
hash_value(EI->getVectorOperand());
10444 if (AllowAlternate)
10455 std::pair<size_t, size_t> OpVals =
10463 if (CI->isCommutative())
10485 SubKey =
hash_value(Gep->getPointerOperand());
10497 return std::make_pair(
Key, SubKey);
10503 Instruction *AltOp,
const TargetLibraryInfo &TLI);
10509 const unsigned VF,
unsigned MinBW,
10532static std::pair<InstructionCost, InstructionCost>
10552 FMF = FPCI->getFastMathFlags();
10555 LibCost.isValid() ? LibCost : ScalarLimit);
10569 assert(L &&
"Expected valid loop");
10575 while (L && IsLoopInvariant(L, VL))
10576 L = L->getParentLoop();
10582 assert(L &&
"Expected valid loop");
10585 SmallVector<const Loop *> &Res =
10586 LoopToLoopNest.try_emplace(L).first->getSecond();
10589 SmallVector<const Loop *> LoopNest;
10592 L =
L->getParentLoop();
10598BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
10600 bool IsScatterVectorizeUserTE,
OrdersType &CurrentOrder,
10601 SmallVectorImpl<Value *> &PointerOps,
StridedPtrInfo &SPtrInfo) {
10603 "Expected instructions with same/alternate opcodes only.");
10605 unsigned ShuffleOrOp =
10606 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10608 switch (ShuffleOrOp) {
10609 case Instruction::PHI: {
10612 return TreeEntry::NeedToGather;
10614 for (
Value *V : VL) {
10618 for (
Value *Incoming :
PHI->incoming_values()) {
10620 if (Term &&
Term->isTerminator()) {
10622 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
10623 return TreeEntry::NeedToGather;
10628 return TreeEntry::Vectorize;
10630 case Instruction::ExtractElement:
10637 return TreeEntry::NeedToGather;
10639 case Instruction::ExtractValue: {
10640 bool Reuse = canReuseExtract(VL, CurrentOrder);
10641 if (Reuse || !CurrentOrder.empty())
10642 return TreeEntry::Vectorize;
10644 return TreeEntry::NeedToGather;
10646 case Instruction::InsertElement: {
10650 for (
Value *V : VL) {
10652 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement/poison vector.\n");
10653 return TreeEntry::NeedToGather;
10657 "Non-constant or undef index?");
10661 return !SourceVectors.contains(V);
10664 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
10665 "different source vectors.\n");
10666 return TreeEntry::NeedToGather;
10671 return SourceVectors.contains(V) && !
V->hasOneUse();
10674 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
10675 "multiple uses.\n");
10676 return TreeEntry::NeedToGather;
10679 return TreeEntry::Vectorize;
// Load bundles. The visible exits produce Vectorize, CompressVectorize,
// ScatterVectorize, StridedVectorize or NeedToGather; the code that selects
// between them is not part of this excerpt.
10681 case Instruction::Load: {
// Helper: checks whether some tree entry returned by getTreeEntries(V) has
// an Idx at or beyond *GatheredLoadsEntriesFirst. The first branch handles
// the case where GatheredLoadsEntriesFirst is not set.
10688 auto IsGatheredNode = [&]() {
10689 if (!GatheredLoadsEntriesFirst)
10694 return any_of(getTreeEntries(V), [&](
const TreeEntry *TE) {
10695 return TE->Idx >= *GatheredLoadsEntriesFirst;
10701 return TreeEntry::Vectorize;
// The same pattern repeats for the three non-plain states below: outside
// graph-transform mode, once the tree holds more than one entry, the current
// tree size is recorded in LoadEntriesToVectorize and the bundle is gathered.
// Otherwise the state is used unless IsGatheredNode() is true.
10703 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10705 LoadEntriesToVectorize.insert(VectorizableTree.size());
10706 return TreeEntry::NeedToGather;
10708 return IsGatheredNode() ? TreeEntry::NeedToGather
10709 : TreeEntry::CompressVectorize;
10711 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10713 LoadEntriesToVectorize.insert(VectorizableTree.size());
10714 return TreeEntry::NeedToGather;
10716 return IsGatheredNode() ? TreeEntry::NeedToGather
10717 : TreeEntry::ScatterVectorize;
10719 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10721 LoadEntriesToVectorize.insert(VectorizableTree.size());
10722 return TreeEntry::NeedToGather;
10724 return IsGatheredNode() ? TreeEntry::NeedToGather
10725 : TreeEntry::StridedVectorize;
// Diagnostics for the gather result: the scalar type's size differs from its
// alloc size (non-packed), a value is not a simple load, or the loads are
// non-consecutive.
10729 if (DL->getTypeSizeInBits(ScalarTy) !=
10730 DL->getTypeAllocSizeInBits(ScalarTy))
10731 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
10734 return !LI || !LI->isSimple();
10738 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering non-consecutive loads.\n");
10741 return TreeEntry::NeedToGather;
// Cast bundles share one handler: the bundle is gathered when its members
// have different source types (see the debug message), otherwise vectorized.
10745 case Instruction::ZExt:
10746 case Instruction::SExt:
10747 case Instruction::FPToUI:
10748 case Instruction::FPToSI:
10749 case Instruction::FPExt:
10750 case Instruction::PtrToInt:
10751 case Instruction::IntToPtr:
10752 case Instruction::SIToFP:
10753 case Instruction::UIToFP:
10754 case Instruction::Trunc:
10755 case Instruction::FPTrunc:
10756 case Instruction::BitCast: {
10758 for (
Value *V : VL) {
10764 dbgs() <<
"SLP: Gathering casts with different src types.\n");
10765 return TreeEntry::NeedToGather;
10768 return TreeEntry::Vectorize;
// Compare bundles.
10770 case Instruction::ICmp:
10771 case Instruction::FCmp: {
10776 for (
Value *V : VL) {
// A compare is rejected when its predicate is neither P0 nor SwapP0, or when
// the type of its first operand differs from ComparedTy. P0, SwapP0 and
// ComparedTy are defined outside this excerpt; presumably they come from the
// bundle's first compare -- TODO confirm.
10780 if ((
Cmp->getPredicate() != P0 &&
Cmp->getPredicate() != SwapP0) ||
10781 Cmp->getOperand(0)->getType() != ComparedTy) {
10782 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering cmp with different predicate.\n");
10783 return TreeEntry::NeedToGather;
10786 return TreeEntry::Vectorize;
// Select bundles: CondTypes accumulates types while walking VL; more than one
// distinct entry means the selects use different condition types, which
// forces a gather.
10788 case Instruction::Select:
10790 SmallPtrSet<Type *, 4> CondTypes;
10791 for (
Value *V : VL) {
10798 if (CondTypes.
size() > 1) {
10801 <<
"SLP: Gathering select with different condition types.\n");
10802 return TreeEntry::NeedToGather;
// Unary/binary arithmetic, logic, shift and Freeze bundles share one handler.
10806 case Instruction::FNeg:
10807 case Instruction::Add:
10808 case Instruction::FAdd:
10809 case Instruction::Sub:
10810 case Instruction::FSub:
10811 case Instruction::Mul:
10812 case Instruction::FMul:
10813 case Instruction::UDiv:
10814 case Instruction::SDiv:
10815 case Instruction::FDiv:
10816 case Instruction::URem:
10817 case Instruction::SRem:
10818 case Instruction::FRem:
10819 case Instruction::Shl:
10820 case Instruction::LShr:
10821 case Instruction::AShr:
10822 case Instruction::And:
10823 case Instruction::Or:
10824 case Instruction::Xor:
10825 case Instruction::Freeze:
// Gather only when all three hold: the main op has a floating-point type, TTI
// reports FP vectorization as potentially unsafe, and some inspected
// instruction is a binary op for which isFast() is false. Everything else is
// vectorized.
10826 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10827 TTI->isFPVectorizationPotentiallyUnsafe() &&
any_of(VL, [](
Value *V) {
10829 return I &&
I->isBinaryOp() && !
I->isFast();
10831 return TreeEntry::NeedToGather;
10832 return TreeEntry::Vectorize;
// GetElementPtr bundles: three passes over VL, each with its own rejection
// reason; the bundle is vectorized only if all passes succeed.
10833 case Instruction::GetElementPtr: {
// Pass 1: an inspected instruction whose operand count is not exactly two is
// rejected ("nested indexes").
10835 for (
Value *V : VL) {
10839 if (
I->getNumOperands() != 2) {
10840 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
10841 return TreeEntry::NeedToGather;
// Pass 2: every GEP's source element type must equal Ty0 (defined outside
// this excerpt).
10848 for (
Value *V : VL) {
10852 Type *CurTy =
GEP->getSourceElementType();
10853 if (Ty0 != CurTy) {
10854 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
10855 return TreeEntry::NeedToGather;
// Pass 3: checks on operand 1 (the index). The visible part of the condition
// compares the index type against Ty1 and its scalar bit width against the
// DataLayout index width for V's pointer address space; the leading part of
// the condition is not visible here.
10861 for (
Value *V : VL) {
10865 auto *
Op =
I->getOperand(1);
10867 (
Op->getType() != Ty1 &&
10869 Op->getType()->getScalarSizeInBits() >
10870 DL->getIndexSizeInBits(
10871 V->getType()->getPointerAddressSpace())))) {
10873 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
10874 return TreeEntry::NeedToGather;
10878 return TreeEntry::Vectorize;
// Store bundles.
10880 case Instruction::Store: {
// The stored value type must have equal size and alloc size according to the
// DataLayout; otherwise the stores are gathered as "non-packed".
10882 llvm::Type *ScalarTy =
cast<StoreInst>(VL0)->getValueOperand()->getType();
10885 if (DL->getTypeSizeInBits(ScalarTy) !=
10886 DL->getTypeAllocSizeInBits(ScalarTy)) {
10887 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering stores of non-packed type.\n");
10888 return TreeEntry::NeedToGather;
// Any store that is not simple forces a gather.
10892 for (
Value *V : VL) {
10894 if (!
SI->isSimple()) {
10896 return TreeEntry::NeedToGather;
// Ptr0/PtrN are the first and last entries of PointerOps, selected through
// CurrentOrder when it is non-empty.
10905 if (CurrentOrder.empty()) {
10906 Ptr0 = PointerOps.
front();
10907 PtrN = PointerOps.
back();
10909 Ptr0 = PointerOps[CurrentOrder.front()];
10910 PtrN = PointerOps[CurrentOrder.back()];
// A distance of exactly VL.size() - 1 yields Vectorize. Presumably Dist is
// the Ptr0-to-PtrN distance measured in elements; its computation is not
// visible -- TODO confirm.
10913 std::optional<int64_t> Dist =
10916 if (
static_cast<uint64_t
>(*Dist) == VL.size() - 1)
10917 return TreeEntry::Vectorize;
// Otherwise a successful check on (CurrentOrder, *Dist, Ptr0, SPtrInfo)
// yields StridedVectorize; the callee's name is outside this excerpt.
10920 CurrentOrder, *Dist, Ptr0, SPtrInfo))
10921 return TreeEntry::StridedVectorize;
10925 return TreeEntry::NeedToGather;
// Call bundles.
10927 case Instruction::Call: {
// Same FP safety gate as the arithmetic cases, except that any inspected
// instruction for which isFast() is false triggers the gather.
10928 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10929 TTI->isFPVectorizationPotentiallyUnsafe() &&
any_of(VL, [](
Value *V) {
10931 return I && !
I->isFast();
10933 return TreeEntry::NeedToGather;
// Vector variant of the main call for the requested Shape, looked up through
// VFDatabase.
10943 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10947 return TreeEntry::NeedToGather;
// ScalarArgs has one slot per call argument, initialised to nullptr; the loop
// that follows fills it (its body is not visible here).
10950 unsigned NumArgs = CI->
arg_size();
10951 SmallVector<Value *, 4> ScalarArgs(NumArgs,
nullptr);
10952 for (
unsigned J = 0; J != NumArgs; ++J)
// Every other call in the bundle must match the main call. The visible part
// of the condition requires the same VFDatabase result for CI2.
10955 for (
Value *V : VL) {
10960 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10962 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched calls:" << *CI <<
"!=" << *V
10964 return TreeEntry::NeedToGather;
// An argument that differs from the recorded ScalarArgs[J] forces a gather.
10968 for (
unsigned J = 0; J != NumArgs; ++J) {
10971 if (ScalarArgs[J] != A1J) {
10973 <<
"SLP: mismatched arguments in call:" << *CI
10974 <<
" argument " << ScalarArgs[J] <<
"!=" << A1J <<
"\n");
10975 return TreeEntry::NeedToGather;
10984 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched bundle operands in calls:" << *CI
10985 <<
"!=" << *V <<
'\n');
10986 return TreeEntry::NeedToGather;
// Final gate: gather when neither member of VecCallCosts is valid for the
// widened type.
10991 auto *VecTy =
getWidenedType(S.getMainOp()->getType(), VL.size());
10993 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10994 return TreeEntry::NeedToGather;
10996 return TreeEntry::Vectorize;
// ShuffleVector is reached both for genuine shufflevector bundles (the
// !S.isAltShuffle() branch) and for alternate-opcode bundles, which were
// mapped to this opcode when ShuffleOrOp was computed.
10998 case Instruction::ShuffleVector: {
10999 if (!S.isAltShuffle()) {
11002 return TreeEntry::Vectorize;
11005 LLVM_DEBUG(
dbgs() <<
"SLP: ShuffleVector are not vectorized.\n");
11006 return TreeEntry::NeedToGather;
11009 return TreeEntry::Vectorize;
// Last visible exit of the function: gather.
11013 return TreeEntry::NeedToGather;
11022 PHINode *Main =
nullptr;
11027 PHIHandler() =
delete;
11029 : DT(DT), Main(Main), Phis(Phis),
11030 Operands(Main->getNumIncomingValues(),
// Fills Operands so that Operands[I][Idx] holds the value associated with
// incoming slot I for the Idx-th element of Phis.
// NOTE(review): several statements of this method are missing from the
// excerpt; the comments describe only what the visible lines do.
11032 void buildOperands() {
// Threshold constant; the comparison that uses it is not visible here.
11033 constexpr unsigned FastLimit = 4;
// First visible loop: one branch stores V itself (guarded by the assertion
// below). Otherwise the value for slot I is read directly when P's I-th
// incoming block is InBB, and looked up by block when it is not.
11042 for (
auto [Idx, V] :
enumerate(Phis)) {
11046 "Expected isa instruction or poison value.");
11047 Operands[
I][Idx] =
V;
11050 if (
P->getIncomingBlock(
I) == InBB)
11051 Operands[
I][Idx] =
P->getIncomingValue(
I);
11053 Operands[
I][Idx] =
P->getIncomingValueForBlock(InBB);
// Map from a basic block to a list of unsigned indices (named Blocks in the
// code below).
11058 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
// Second visible loop: values are written either at slot I directly or, via
// the Blocks lookup, at the first index recorded for InBB.
11068 for (
auto [Idx, V] :
enumerate(Phis)) {
11071 Operands[
I][Idx] =
V;
11080 Operands[
I][Idx] =
P->getIncomingValue(
I);
11083 auto *It = Blocks.
find(InBB);
11084 if (It == Blocks.
end())
11086 Operands[It->second.front()][Idx] =
P->getIncomingValue(
I);
// For every block that has more than one recorded index, Operands[BasicI] is
// copied into Operands[I]. The assertion expects each overwritten entry to be
// either null or already equal to the BasicI value.
11089 for (
const auto &
P : Blocks) {
11090 ArrayRef<unsigned> IncomingValues =
P.second;
11091 if (IncomingValues.
size() <= 1)
11094 for (
unsigned I : IncomingValues) {
11096 [&](
const auto &
Data) {
11097 return !
Data.value() ||
11098 Data.value() == Operands[BasicI][
Data.index()];
11100 "Expected empty operands list.");
11101 Operands[
I] = Operands[BasicI];
11114static std::pair<Instruction *, Instruction *>
11118 for (
Value *V : VL) {
11128 if (MainOp->
getOpcode() ==
I->getOpcode()) {
11147 "Expected different main and alt instructions.");
11148 return std::make_pair(MainOp, AltOp);
11161 const InstructionsState &S,
11163 const BoUpSLP &R,
bool BuildGatherOnly =
true) {
11167 for (
Value *V : VL) {
11184 bool RequireScheduling = S && S.getOpcode() != Instruction::PHI &&
11186 (S.areInstructionsWithCopyableElements() ||
11191 bool AreAllValuesNonConst = UniquePositions.
size() == UniqueValues.
size();
11193 if (RequireScheduling) {
11197 assert(EndIt != UniqueValues.
rend() &&
"Expected at least one non-poison.");
11198 UniqueValues.
erase(EndIt.base(), UniqueValues.
end());
11200 unsigned NumUniqueScalarValues = UniqueValues.
size();
11201 if (NumUniqueScalarValues == VL.
size()) {
11202 ReuseShuffleIndices.
clear();
11210 constexpr unsigned SmallVecWidth = 4;
11211 constexpr unsigned SmallVecUniqueThreshold = 3;
11212 if (VL.
size() == SmallVecWidth &&
11213 NumUniqueScalarValues == SmallVecUniqueThreshold && !BuildGatherOnly &&
11214 !(S && (S.getOpcode() == Instruction::Load ||
11215 S.getOpcode() == Instruction::PHI))) {
11217 ReuseShuffleIndices.
clear();
11223 auto EstimatePackPlusShuffleVsInserts = [&]() {
11225 if (UniquePositions.
size() == 1 &&
11226 (NumUniqueScalarValues == 1 ||
11228 return std::make_pair(
false,
false);
11232 constexpr unsigned MinVLForConstGatherCheck = 4;
11233 if (BuildGatherOnly && VL.
size() > MinVLForConstGatherCheck &&
11236 UniquePositions.
size() * 2 < NumUniqueScalarValues)
11237 return std::make_pair(
false,
false);
11239 assert(S && S.getOpcode() == Instruction::Load &&
"Expected load.");
11244 PointerOps, SPtrInfo);
11258 bool IsRootOperand =
11259 UserTreeIdx.
UserTE && UserTreeIdx.
UserTE->Idx == 0 && !BuildGatherOnly;
11260 if (IsRootOperand) {
11261 if (S && S.getOpcode() == Instruction::Load) {
11262 bool UseOrig = (CheckLoads(UniqueValues,
true) &&
11263 CheckLoads(VL,
false)) ||
11265 ReuseShuffleIndices, ReuseShuffleIndices.
size());
11266 return std::make_pair(
true, UseOrig);
11268 return std::make_pair(
true, !RequireScheduling);
11271 for (
auto [Idx, Val] :
enumerate(ReuseShuffleIndices))
11273 DemandedElts.
setBit(Idx);
11276 auto *UniquesVecTy =
getWidenedType(ScalarTy, NumUniqueScalarValues);
11278 const unsigned UniquesNumParts =
11282 if (!RequireScheduling) {
11283 if (VL.
size() / NumUniqueScalarValues == 1 &&
11284 (NumParts <= 1 || UniquesNumParts >= NumParts))
11285 return std::make_pair(
true,
true);
11288 if (S && S.getOpcode() == Instruction::PHI && NumUniqueScalarValues > 1 &&
11289 UniquesNumParts <= NumParts)
11290 return std::make_pair(
true,
false);
11305 if (S && !BuildGatherOnly) {
11306 bool HasOneDup = S.getOpcode() != Instruction::Load &&
11307 NumUniqueScalarValues + 1 == VL.
size();
11308 bool MostlyUnique = NumUniqueScalarValues * 2 > VL.
size();
11309 bool IsHalfUniqueValues =
11310 NumUniqueScalarValues * 2 == VL.
size() &&
11311 (S.getOpcode() == Instruction::GetElementPtr ||
11314 NumParts * (VL.
size() > SmallVecWidth ? 1 : 2);
11316 ((MostlyUnique || IsHalfUniqueValues) && ReusesCost >
CostThreshold))
11317 return std::make_pair(
true,
true);
11322 if (S && S.getOpcode() == Instruction::Load) {
11323 bool UniquesVectorized =
11324 CheckLoads(UniqueValues,
false);
11325 if (UniquesVectorized || CheckLoads(VL,
false))
11326 return std::make_pair(
true, !UniquesVectorized);
11328 bool CanSkipBVCost =
11329 (!BuildGatherOnly && !RequireScheduling) || R.hasSameNode(S, VL);
11335 CostKind, AreAllValuesNonConst, VL);
11337 for (
const auto [Idx, V] :
enumerate(UniqueValues))
11339 UniquesDemandedElts.
clearBit(Idx);
11344 UniquesDemandedElts,
true,
11346 AreAllValuesNonConst, UniqueValues);
11347 UniquesCost += ReusesCost;
11348 if (UniquesCost <= InsertsCost)
11349 return std::make_pair(
true,
false);
11352 (R.getTreeSize() == 0 && R.isReductionTree() &&
11354 return std::make_pair(S && (!S.isAltShuffle() || !BuildGatherOnly),
11358 bool KeepOriginal = !BuildGatherOnly && !RequireScheduling;
11359 return std::make_pair(KeepOriginal, KeepOriginal);
11362 const auto [PackProfitable, UseOriginal] = EstimatePackPlusShuffleVsInserts();
11364 if (PackProfitable) {
11367 ReuseShuffleIndices.
clear();
11371 VL = std::move(UniqueValues);
11378 ReuseShuffleIndices.
clear();
11383 const InstructionsState &LocalState,
11387 constexpr unsigned SmallNodeSize = 4;
11388 if (VL.
size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
11393 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *LocalState.getMainOp()
11395 for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
11396 if (E->isSame(VL)) {
11398 << *LocalState.getMainOp() <<
".\n");
11416 Op1Indices.
set(Idx);
11419 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
11422 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
11424 LocalState.getAltOp(), *TLI))) {
11426 Op1Indices.
set(Idx);
11433 unsigned Opcode0 = LocalState.getOpcode();
11434 unsigned Opcode1 = LocalState.getAltOpcode();
11440 if (UOp1.
size() <= 1 || UOp2.
size() <= 1 ||
11441 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
11443 unsigned Op1Cnt = 0, Op2Cnt = Op1.
size();
11445 if (Op1Indices.
test(Idx)) {
11446 ReorderIndices[Op1Cnt] = Idx;
11449 ReorderIndices[Op2Cnt] = Idx;
11454 ReorderIndices.
clear();
11464 if (!ReorderIndices.
empty())
11466 unsigned NumParts = TTI->getNumberOfParts(VecTy);
11471 if (NumParts >= VL.
size())
11480 if (!LocalState.isCmpOp() && NumParts <= 1 &&
11481 (Mask.empty() || InsertCost >= NewShuffleCost))
11483 if ((LocalState.getMainOp()->isBinaryOp() &&
11484 LocalState.getAltOp()->isBinaryOp() &&
11485 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
11486 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
11487 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
11488 (LocalState.getMainOp()->isUnaryOp() &&
11489 LocalState.getAltOp()->isUnaryOp())) {
11491 TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
11492 TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
11497 OriginalMask[Idx] = Idx + (Op1Indices.
test(Idx) ? 0 : VL.
size());
11501 VecTy, OriginalMask, Kind);
11503 TTI->getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
11504 TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
11506 NewVecOpsCost + InsertCost +
11507 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
11508 VectorizableTree.front()->getOpcode() == Instruction::Store
11512 if (NewCost >= OriginalCost)
11522class InstructionsCompatibilityAnalysis {
11527 unsigned MainOpcode = 0;
// Returns true when Opcode is one of the thirteen opcodes listed below:
// Add, Sub, LShr, Shl, SDiv, UDiv, And, Or, Xor, FAdd, FSub, FMul, FDiv.
// Opcodes absent from the list (for example Mul, AShr and the remainder
// opcodes) yield false.
11532 static bool isSupportedOpcode(
const unsigned Opcode) {
11533 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
11534 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
11535 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
11536 Opcode == Instruction::And || Opcode == Instruction::Or ||
11537 Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
11538 Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
11539 Opcode == Instruction::FDiv;
11549 auto IsSupportedInstruction = [&](
Instruction *
I,
bool AnyUndef) {
11550 if (AnyUndef && (
I->isIntDivRem() ||
I->isFPDivRem() ||
isa<CallInst>(
I)))
11552 return I && isSupportedOpcode(
I->getOpcode()) &&
11557 SmallDenseSet<Value *, 8> Operands;
11558 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
11559 bool AnyUndef =
false;
11560 for (
Value *V : VL) {
11568 if (Candidates.
empty()) {
11569 Candidates.
try_emplace(
I->getOpcode()).first->second.push_back(
I);
11571 Operands.
insert(
I->op_begin(),
I->op_end());
11574 if (Parent ==
I->getParent()) {
11575 Candidates.
try_emplace(
I->getOpcode()).first->second.push_back(
I);
11576 Operands.
insert(
I->op_begin(),
I->op_end());
11579 auto *NodeA = DT.
getNode(Parent);
11580 auto *NodeB = DT.
getNode(
I->getParent());
11581 assert(NodeA &&
"Should only process reachable instructions");
11582 assert(NodeB &&
"Should only process reachable instructions");
11583 assert((NodeA == NodeB) ==
11584 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11585 "Different nodes should have different DFS numbers");
11586 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
11587 Candidates.
clear();
11588 Candidates.
try_emplace(
I->getOpcode()).first->second.push_back(
I);
11591 Operands.
insert(
I->op_begin(),
I->op_end());
11594 unsigned BestOpcodeNum = 0;
11596 bool UsedOutside =
false;
11597 for (
const auto &
P : Candidates) {
11599 if (UsedOutside && !PUsedOutside)
11601 if (!UsedOutside && PUsedOutside)
11603 if (
P.second.size() < BestOpcodeNum)
11606 if (!PUsedOutside &&
any_of(
P.second, [&](Instruction *
I) {
11607 return Operands.contains(I);
11610 UsedOutside = PUsedOutside;
11611 for (Instruction *
I :
P.second) {
11612 if (IsSupportedInstruction(
I, AnyUndef)) {
11614 BestOpcodeNum =
P.second.size();
11624 return I &&
I->getParent() == MainOp->
getParent() &&
11637 Value *selectBestIdempotentValue()
const {
11638 assert(isSupportedOpcode(MainOpcode) &&
"Unsupported opcode");
11649 if (!S.isCopyableElement(V))
11651 assert(isSupportedOpcode(MainOpcode) &&
"Unsupported opcode");
11652 return {
V, selectBestIdempotentValue()};
11658 SmallVectorImpl<BoUpSLP::ValueList> &Operands)
const {
11660 unsigned ShuffleOrOp =
11661 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11664 switch (ShuffleOrOp) {
11665 case Instruction::PHI: {
11669 PHIHandler Handler(DT, PH, VL);
11670 Handler.buildOperands();
11671 Operands.
assign(PH->getNumOperands(), {});
11673 Operands[
I].
assign(Handler.getOperands(
I).begin(),
11674 Handler.getOperands(
I).end());
11677 case Instruction::ExtractValue:
11678 case Instruction::ExtractElement:
11683 case Instruction::InsertElement:
11691 case Instruction::Load:
11695 for (
auto [V,
Op] :
zip(VL, Operands.
back())) {
11699 Op = LI->getPointerOperand();
11702 case Instruction::ZExt:
11703 case Instruction::SExt:
11704 case Instruction::FPToUI:
11705 case Instruction::FPToSI:
11706 case Instruction::FPExt:
11707 case Instruction::PtrToInt:
11708 case Instruction::IntToPtr:
11709 case Instruction::SIToFP:
11710 case Instruction::UIToFP:
11711 case Instruction::Trunc:
11712 case Instruction::FPTrunc:
11713 case Instruction::BitCast:
11714 case Instruction::ICmp:
11715 case Instruction::FCmp:
11716 case Instruction::FNeg:
11717 case Instruction::Add:
11718 case Instruction::FAdd:
11719 case Instruction::Sub:
11720 case Instruction::FSub:
11721 case Instruction::Mul:
11722 case Instruction::FMul:
11723 case Instruction::UDiv:
11724 case Instruction::SDiv:
11725 case Instruction::FDiv:
11726 case Instruction::URem:
11727 case Instruction::SRem:
11728 case Instruction::FRem:
11729 case Instruction::Shl:
11730 case Instruction::LShr:
11731 case Instruction::AShr:
11732 case Instruction::And:
11733 case Instruction::Or:
11734 case Instruction::Xor:
11735 case Instruction::Freeze:
11736 case Instruction::Store:
11737 case Instruction::ShuffleVector:
11746 auto [
Op, ConvertedOps] = convertTo(
I, S);
11751 case Instruction::Select:
11765 Operands[0][Idx] =
I->getOperand(0);
11766 Operands[1][Idx] = ConstantInt::get(
I->getType(), 1);
11767 Operands[2][Idx] = ConstantInt::getNullValue(
I->getType());
11770 auto [
Op, ConvertedOps] = convertTo(
I, S);
11775 case Instruction::GetElementPtr: {
11782 const unsigned IndexIdx = 1;
11788 return !
GEP || VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
11792 ->getPointerOperandType()
11793 ->getScalarType());
11797 Operands[0][Idx] =
V;
11798 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
11801 Operands[0][Idx] =
GEP->getPointerOperand();
11802 auto *
Op =
GEP->getOperand(IndexIdx);
11805 CI, Ty, CI->getValue().isSignBitSet(),
DL)
11810 case Instruction::Call: {
11817 for (
Value *V : VL) {
11819 Ops.push_back(
I ?
I->getOperand(Idx)
11838 const InstructionsState &S,
11839 const InstructionsState &CopyableS) {
11846 Instruction *SAlt = S.isAltShuffle() ? S.getAltOp() :
nullptr;
11848 const bool IsAltCommutative =
11852 buildOriginalOperands(S, SMain,
Ops);
11854 if (
Ops.size() != 2)
11866 auto *I = dyn_cast<Instruction>(V);
11867 return I && I->getOpcode() == SMainOpI->getOpcode();
11870 SmallPtrSet<Value *, 8> Operands;
11871 for (
Value *V : VL) {
11873 if (!
I ||
I == SMain)
11875 Instruction *MatchingOp = S.getMatchingMainOpOrAltOp(
I);
11876 if (MatchingOp != SMain)
11879 buildOriginalOperands(S,
I, VOps);
11880 Operands.
insert(
I->op_begin(),
I->op_end());
11882 "Expected binary operations only.");
11883 if (CheckOperands(VOps[0][0],
Ops[0][0]) ||
11884 CheckOperands(VOps[1][0],
Ops[1][0]) ||
11885 (IsCommutative && (CheckOperands(VOps[0][0],
Ops[1][0]) ||
11886 CheckOperands(VOps[1][0],
Ops[0][0])))) {
11893 buildOriginalOperands(S, MainOp, MainOps);
11895 auto BuildFirstOperandCandidates =
11896 [&](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
11898 bool IsCommutative) {
11904 auto BuildSecondOperandCandidates =
11905 [&](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
11907 Value *Op1,
bool IsCommutative) {
11908 if (PrevBestIdx != 1)
11910 if (PrevBestIdx != 0 && IsCommutative)
11914 auto FindBestCandidate =
11917 auto Res =
R.findBestRootPair(Candidates);
11918 Score = Res.second;
11921 isConstant(Candidates[Res.first.value_or(0)].first) &&
11922 isConstant(Candidates[Res.first.value_or(0)].second);
11926 for (
const auto [Idx,
P] :
enumerate(Candidates)) {
11928 P.second ==
P.first) {
11931 Score =
isa<LoadInst>(Candidates[Res.first.value_or(0)].first)
11941 for (
Value *V : VL) {
11943 if (!
I || (
I == MainOp && (!S.isAltShuffle() ||
I == SMain)) ||
11944 (!S.isAltShuffle() &&
I == SMain))
11947 buildOriginalOperands(S,
I == SMain ? MainOp :
I, VOps);
11949 getOperands(CopyableS,
I == MainOp ? SMain :
I);
11950 if (CopyableOps.
size() == VOps.
size() &&
11951 all_of(
zip(CopyableOps, VOps), [&](
const auto &
P) {
11952 return std::get<0>(
P) == std::get<1>(
P)[0];
11956 BuildFirstOperandCandidates(Candidates, MainOps, CopyableOps[0],
11957 CopyableOps[1], IsMainCommutative);
11958 const unsigned OpSize = Candidates.
size();
11960 S.getMatchingMainOpOrAltOp(
I) == S.getMainOp() ? SMain : SAlt;
11961 const bool IsCommutativeInst =
11962 (MatchingOp == SMain ? IsCommutative : IsAltCommutative) ||
11964 if (S.isAltShuffle() && MatchingOp == SAlt &&
11970 if (S.isAltShuffle() && MatchingOp == SMain)
11971 Operands.
insert(
I->op_begin(),
I->op_end());
11972 BuildFirstOperandCandidates(Candidates,
Ops, VOps[0][0], VOps[1][0],
11973 IsCommutativeInst);
11976 std::optional<int> BestOp =
11977 FindBestCandidate(Candidates, IsBestConst, Score);
11978 const bool IsOriginalBetter =
11979 static_cast<unsigned>(BestOp.value_or(OpSize)) >= OpSize;
11980 Candidates.
clear();
11981 BuildSecondOperandCandidates(
11982 Candidates, MainOps, IsOriginalBetter ? -1 : *BestOp, CopyableOps[0],
11983 CopyableOps[1], IsMainCommutative);
11984 const unsigned SecondOpSize = Candidates.
size();
11985 BuildSecondOperandCandidates(
11987 IsOriginalBetter ? BestOp.value_or(OpSize - 1) - OpSize : -1,
11988 VOps[0][0], VOps[1][0], IsCommutativeInst);
11989 bool IsSecondBestConst;
11991 std::optional<int> SecondBestOp =
11992 FindBestCandidate(Candidates, IsSecondBestConst, SecondScore);
11994 if (!BestOp && !SecondBestOp)
11997 const bool IsSecondOriginalBetter =
11998 static_cast<unsigned>(SecondBestOp.value_or(SecondOpSize)) >=
12000 if (IsOriginalBetter && IsSecondOriginalBetter)
12004 if (!BestOp && IsSecondOriginalBetter)
12008 if (!SecondBestOp && IsOriginalBetter)
12012 if (!IsOriginalBetter && IsBestConst && IsSecondOriginalBetter &&
12013 !IsSecondBestConst)
12017 if (BestOp && IsOriginalBetter && !IsBestConst &&
12018 !IsSecondOriginalBetter && IsSecondBestConst)
12021 if (((Score > SecondScore ||
12023 Score == SecondScore)) &&
12024 IsOriginalBetter) ||
12025 (IsSecondOriginalBetter &&
12026 (SecondScore > Score ||
12028 Score == SecondScore))))
12035 InstructionsCompatibilityAnalysis(DominatorTree &DT,
const DataLayout &
DL,
12036 const TargetTransformInfo &
TTI,
12037 const TargetLibraryInfo &TLI)
12042 bool WithProfitabilityCheck =
false,
12043 bool SkipSameCodeCheck =
false) {
12044 InstructionsState S = (SkipSameCodeCheck || !
allSameBlock(VL))
12045 ? InstructionsState::invalid()
12057 return (ZExt && ZExt->getSrcTy()->isIntegerTy(1)) ||
12061 return InstructionsState(SelectOp, SelectOp);
12063 if (S && S.isAltShuffle()) {
12064 Type *ScalarTy = S.getMainOp()->getType();
12066 unsigned Opcode0 = S.getOpcode();
12067 unsigned Opcode1 = S.getAltOpcode();
12068 SmallBitVector OpcodeMask(
12077 return !
I ||
I->getOpcode() == S.getOpcode() ||
12078 (S.getOpcode() == Instruction::Add &&
12079 I->getOpcode() == Instruction::Shl);
12085 findAndSetMainInstruction(VL, R);
12088 InstructionsState OrigS = S;
12089 S = InstructionsState(MainOp, MainOp,
true);
12090 if (OrigS && !isCopyablePreferable(VL, R, OrigS, S))
12092 if (!WithProfitabilityCheck)
12096 auto BuildCandidates =
12097 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
Value *V1,
12103 if (I1 && I2 &&
I1->getOpcode() == I2->getOpcode() &&
12104 I1->getParent() != I2->getParent())
12108 if (VL.
size() == 2) {
12111 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
12112 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
12113 bool Res = !Candidates1.
empty() && !Candidates2.
empty() &&
12114 R.findBestRootPair(Candidates1).first &&
12115 R.findBestRootPair(Candidates2).first;
12117 Candidates1.
clear();
12118 Candidates2.
clear();
12119 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
12120 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
12121 Res = !Candidates1.
empty() && !Candidates2.
empty() &&
12122 R.findBestRootPair(Candidates1).first &&
12123 R.findBestRootPair(Candidates2).first;
12130 FixedVectorType *VecTy =
12132 switch (MainOpcode) {
12133 case Instruction::Add:
12134 case Instruction::Sub:
12135 case Instruction::LShr:
12136 case Instruction::Shl:
12137 case Instruction::SDiv:
12138 case Instruction::UDiv:
12139 case Instruction::And:
12140 case Instruction::Or:
12141 case Instruction::Xor:
12142 case Instruction::FAdd:
12143 case Instruction::FMul:
12144 case Instruction::FSub:
12145 case Instruction::FDiv:
12151 if (VectorCost > ScalarCost)
12155 assert(Operands.
size() == 2 &&
"Unexpected number of operands!");
12156 unsigned CopyableNum =
12157 count_if(VL, [&](
Value *V) {
return S.isCopyableElement(V); });
12158 if (CopyableNum < VL.
size() / 2)
12161 const unsigned Limit = VL.
size() / 24;
12162 if ((CopyableNum >= VL.
size() - Limit ||
12163 (CopyableNum >= VL.
size() - 1 && VL.
size() > 4) ||
12172 Value *BestFrontOp =
nullptr;
12173 for (
auto [OpL, OpR] :
zip(Operands.
front(), Operands.
back())) {
12194 const unsigned BestOpcode = BestLHS->getOpcode();
12195 for (
auto [OpL, OpR] :
zip(Operands.
front(), Operands.
back())) {
12199 if (OpRI->getOpcode() == BestOpcode)
12212 constexpr unsigned Limit = 4;
12213 if (Operands.
front().size() >= Limit) {
12214 SmallDenseMap<const Value *, unsigned>
Counters;
12222 return C.second == 1;
12228 InstructionsCompatibilityAnalysis
Analysis(DT,
DL,
TTI, TLI);
12229 InstructionsState OpS =
Analysis.buildInstructionsState(
Ops, R);
12230 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !
allSameBlock(
Ops)))
12232 unsigned CopyableNum =
12234 return CopyableNum <= VL.
size() / 2;
12236 if (!CheckOperand(Operands.
front()))
12244 assert(S &&
"Invalid state!");
12246 if (S.areInstructionsWithCopyableElements()) {
12247 MainOp = S.getMainOp();
12248 MainOpcode = S.getOpcode();
12249 const bool IsCommutative =
12256 for (
auto [OperandIdx, Operand] :
enumerate(OperandsForValue))
12257 Operands[OperandIdx][Idx] = Operand;
12263 if (IsCommutative) {
12269 unsigned FwdCount = 0;
12270 unsigned RevCount = 0;
12272 SmallMapVector<std::pair<unsigned, unsigned>, PairInfo, 8> PairCounts;
12273 unsigned MajID0 = 0, MajID1 = 0;
12277 unsigned ID0 = Operands[0][Idx]->getValueID();
12278 unsigned ID1 = Operands[1][Idx]->getValueID();
12281 unsigned MinID = std::min(ID0, ID1);
12282 unsigned MaxID = std::max(ID0, ID1);
12285 PairInfo &
Info = It->second;
12294 unsigned BestCount = 0;
12295 for (
const auto &
P : PairCounts) {
12296 const PairInfo &
Info =
P.second;
12298 if (
Total > BestCount) {
12300 if (
Info.FwdCount >=
Info.RevCount) {
12301 MajID0 =
P.first.first;
12302 MajID1 =
P.first.second;
12304 MajID0 =
P.first.second;
12305 MajID1 =
P.first.first;
12316 unsigned LAt0 = 0, LAt1 = 0, TotalNC = 0;
12321 if (BestCount > 0) {
12322 unsigned ID0 = Operands[0][Idx]->getValueID();
12323 unsigned ID1 = Operands[1][Idx]->getValueID();
12324 if (ID0 == MajID1 && ID1 == MajID0)
12325 std::swap(Operands[0][Idx], Operands[1][Idx]);
12333 if (TotalNC > 1 && LAt1 > LAt0 && LAt1 * 2 > TotalNC) {
12339 std::swap(Operands[0][Idx], Operands[1][Idx]);
12344 buildOriginalOperands(S, VL, Operands);
// Decides whether the bundle VL may be vectorized at all and returns the
// verdict together with the computed InstructionsState. Every early exit
// visible here passes `false` as the second constructor argument and the
// final exit passes `true`; presumably that argument is the "is legal" flag
// that callers read through isLegal() -- TODO confirm against the class.
// NOTE(review): many statements are missing from this excerpt; the comments
// describe only the visible checks.
12351BoUpSLP::ScalarsVectorizationLegality
12353 const EdgeInfo &UserTreeIdx)
const {
12356 InstructionsCompatibilityAnalysis
Analysis(*DT, *DL, *TTI, *TLI);
12357 InstructionsState S =
Analysis.buildInstructionsState(
12360 bool AreScatterAllGEPSameBlock =
false;
12362 SmallVector<unsigned> SortedIndices;
// True when the user tree entry exists and is in the ScatterVectorize state.
12364 bool IsScatterVectorizeUserTE =
12365 UserTreeIdx.UserTE &&
12366 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
12367 AreScatterAllGEPSameBlock =
12381 *SE, SortedIndices));
12382 if (!AreScatterAllGEPSameBlock) {
12383 LLVM_DEBUG(
dbgs() <<
"SLP: Try split and if failed, gathering due to "
12384 "C,S,B,O, small shuffle. \n";
12388 return ScalarsVectorizationLegality(S,
false,
12394 assert(It != VL.
end() &&
"Expected at least one GEP.");
12397 assert(S &&
"Must be valid.");
12403 return ScalarsVectorizationLegality(S,
false,
// A main op whose block is not reachable from the function entry is part of
// a rejection condition (the rest of the condition is not visible).
12409 BasicBlock *BB = S.getMainOp()->getParent();
12412 !DT->isReachableFromEntry(BB)) {
12418 return ScalarsVectorizationLegality(S,
false);
12427 return ScalarsVectorizationLegality(S,
false,
12432 if (S.getOpcode() == Instruction::ExtractElement &&
12435 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to scalable vector type.\n");
12436 return ScalarsVectorizationLegality(S,
false);
12443 (S.isAltShuffle() || VL.
size() < 4 ||
12450 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to max recursion depth.\n");
12451 return ScalarsVectorizationLegality(S,
false);
12455 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *S.getMainOp() <<
".\n");
// True for a PHI bundle whose main op lives in a block that belongs to a
// loop.
12457 const bool IsPHIWithLoop =
12458 S.getOpcode() == Instruction::PHI &&
12459 LI->getLoopFor(S.getMainOp()->getParent()) !=
nullptr;
// An existing tree entry with exactly the same scalars ("perfect diamond
// merge") ends the analysis with a non-legal result.
12460 for (TreeEntry *
E : getTreeEntries(S.getMainOp())) {
12461 if (
E->isSame(VL)) {
12462 LLVM_DEBUG(
dbgs() <<
"SLP: Perfect diamond merge at " << *S.getMainOp()
12464 return ScalarsVectorizationLegality(S,
false);
12472 return ScalarsVectorizationLegality(S,
false);
// NOTE(review): as written, AreAllSameInsts is X || !X with
// X = AreScatterAllGEPSameBlock, so it is always true and the
// `!AreAllSameInsts` term below can never fire -- confirm whether that is
// intended.
12476 bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
12477 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
12478 if (!AreAllSameInsts ||
isSplat(VL) ||
12482 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to C,S,B,O conditions. \n";
12486 return ScalarsVectorizationLegality(S,
false);
// Any scalar that is a member of EphValues (an ephemeral value, per the debug
// message) makes the bundle non-legal.
12490 if (!EphValues.empty()) {
12491 for (
Value *V : VL) {
12492 if (EphValues.count(V)) {
12494 <<
") is ephemeral.\n");
12496 return ScalarsVectorizationLegality(S,
false,
// Profitability gate for alternate-opcode bundles.
12508 if (S.isAltShuffle()) {
// Helper returning the pair (Vectorized, Extracted); how the two masks are
// populated is only partly visible.
12509 auto GetNumVectorizedExtracted = [&]() {
12515 all_of(
I->operands(), [&](
const Use &U) {
12516 return isa<ExtractElementInst>(U.get());
12521 else if (!
I->hasOneUser() && !areAllUsersVectorized(
I, UserIgnoreList))
12524 return std::make_pair(Vectorized, Extracted);
12526 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
// Two-element bundles prefer scalarization as soon as Vectorized is not all
// ones; larger bundles compare the two cost estimates instead.
12528 bool PreferScalarize = !Vectorized.
isAllOnes() && VL.size() == 2;
12529 if (!Vectorized.
isAllOnes() && !PreferScalarize) {
12532 Type *ScalarTy = VL.front()->getType();
12537 false,
true, Kind);
12539 *TTI, ScalarTy, VecTy, Vectorized,
12540 true,
false, Kind,
false);
12541 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
12543 if (PreferScalarize) {
12544 LLVM_DEBUG(
dbgs() <<
"SLP: The instructions are in tree and alternate "
12545 "node is not profitable.\n");
12546 return ScalarsVectorizationLegality(S,
false);
// A scalar that appears in the user ignore list makes the bundle non-legal.
12551 if (UserIgnoreList && !UserIgnoreList->empty()) {
12552 for (
Value *V : VL) {
12553 if (UserIgnoreList->contains(V)) {
12554 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to gathered scalar.\n");
12555 return ScalarsVectorizationLegality(S,
false);
// No visible check rejected the bundle.
12560 return ScalarsVectorizationLegality(S,
true);
12565 unsigned InterleaveFactor) {
12568 SmallVector<int> ReuseShuffleIndices;
12572 auto TrySplitNode = [&](
const InstructionsState &LocalState) {
12578 auto Invalid = ScheduleBundle::invalid();
12579 auto *
TE = newTreeEntry(VL, TreeEntry::SplitVectorize,
Invalid, LocalState,
12580 UserTreeIdx, {}, ReorderIndices);
12585 getSameValuesTreeEntry(S.getMainOp(),
Op,
true))) {
12587 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
12588 Idx == 0 ? 0 : Op1.
size());
12589 (void)newTreeEntry(
Op, TreeEntry::NeedToGather,
Invalid, S, {
TE, Idx});
12591 TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
12592 Idx == 0 ? 0 : Op1.
size());
12602 bool AreConsts =
false;
12603 for (
Value *V : VL) {
12615 if (AreOnlyConstsWithPHIs(VL)) {
12616 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to all constants and PHIs.\n");
12617 newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
12621 ScalarsVectorizationLegality Legality =
12622 getScalarsVectorizationLegality(VL,
Depth, UserTreeIdx);
12623 InstructionsState S = Legality.getInstructionsState();
12624 if (!Legality.isLegal()) {
12625 if (Legality.trySplitVectorize()) {
12628 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
12631 if (Legality.tryToFindDuplicates())
12633 UserTreeIdx, *
this);
12635 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12640 if (S.isAltShuffle() && TrySplitNode(S))
12646 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12651 bool IsScatterVectorizeUserTE =
12652 UserTreeIdx.UserTE &&
12653 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
12657 TreeEntry::EntryState State = getScalarsVectorizationState(
12658 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
12659 if (State == TreeEntry::NeedToGather) {
12660 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12667 if (VectorizableTree.empty()) {
12668 assert(CurrentLoopNest.empty() &&
"Expected empty loop nest");
12670 BasicBlock *Parent = S.getMainOp()->getParent();
12671 if (
const Loop *L = LI->getLoopFor(Parent)) {
12674 CurrentLoopNest.assign(getLoopNest(L));
12676 }
else if (!UserTreeIdx ||
12677 UserTreeIdx.UserTE->State == TreeEntry::SplitVectorize ||
12678 UserTreeIdx.UserTE->isGather() ||
12679 UserTreeIdx.UserTE->getMainOp()->getParent() !=
12680 S.getMainOp()->getParent()) {
12681 BasicBlock *Parent = S.getMainOp()->getParent();
12682 if (
const Loop *L = LI->getLoopFor(Parent)) {
12695 SmallVector<const Loop *> NewLoopNest(getLoopNest(L));
12696 unsigned CommonLen = 0;
12697 for (
const auto [L1, L2] :
zip(CurrentLoopNest, NewLoopNest)) {
12702 auto ValidateMergedBTCs = [&](
unsigned StartDepth) ->
bool {
12703 unsigned EndDepth =
12704 std::min<unsigned>(NewLoopNest.size(), MergedLoopBTCs.size());
12705 for (
unsigned D = StartDepth;
D < EndDepth; ++
D) {
12706 const SCEV *Constraint = MergedLoopBTCs[
D];
12709 const SCEV *NewBTC = SE->getBackedgeTakenCount(NewLoopNest[
D]);
12715 auto BailOutToGather = [&]() {
12717 <<
"SLP: Sibling loops have different trip counts.\n");
12718 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12720 if (CurrentLoopNest.empty()) {
12721 if (!ValidateMergedBTCs(0)) {
12725 CurrentLoopNest.assign(NewLoopNest);
12726 }
else if (CommonLen < CurrentLoopNest.size() &&
12727 CommonLen < NewLoopNest.size()) {
12736 const Loop *SibA = CurrentLoopNest[CommonLen];
12737 const Loop *SibB = NewLoopNest[CommonLen];
12738 const SCEV *BecA = SE->getBackedgeTakenCount(SibA);
12739 const SCEV *BecB = SE->getBackedgeTakenCount(SibB);
12744 if (!ValidateMergedBTCs(CommonLen + 1)) {
12748 if (MergedLoopBTCs.size() <= CommonLen)
12749 MergedLoopBTCs.resize(CommonLen + 1,
nullptr);
12750 MergedLoopBTCs[CommonLen] = BecA;
12751 CurrentLoopNest.truncate(CommonLen);
12752 }
else if (NewLoopNest.size() > CurrentLoopNest.size()) {
12753 if (!ValidateMergedBTCs(CurrentLoopNest.size())) {
12757 CurrentLoopNest.append(
12758 std::next(NewLoopNest.begin(), CurrentLoopNest.size()),
12759 NewLoopNest.end());
12768 auto &BSRef = BlocksSchedules[BB];
12770 BSRef = std::make_unique<BlockScheduling>(BB);
12772 BlockScheduling &BS = *BSRef;
12775 std::optional<ScheduleBundle *> BundlePtr =
12776 BS.tryScheduleBundle(UniqueValues.getArrayRef(),
this, S, UserTreeIdx);
12777#ifdef EXPENSIVE_CHECKS
12781 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
12782 LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
12784 if (S.isAltShuffle() && ReuseShuffleIndices.
empty() && TrySplitNode(S))
12786 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12787 NonScheduledFirst.insert(VL.front());
12788 if (S.getOpcode() == Instruction::Load &&
12789 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
12793 InstructionsCompatibilityAnalysis
Analysis(*DT, *DL, *TTI, *TLI);
12795 ScheduleBundle
Empty;
12796 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() :
Empty;
12797 LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
12799 unsigned ShuffleOrOp =
12800 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
12801 auto CreateOperandNodes = [&](TreeEntry *
TE,
const auto &Operands) {
12803 SmallVector<unsigned> PHIOps;
12809 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
12814 for (
unsigned I : PHIOps)
12815 buildTreeRec(Operands[
I],
Depth + 1, {
TE,
I});
12817 switch (ShuffleOrOp) {
12818 case Instruction::PHI: {
12820 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
12824 TE->setOperands(Operands);
12825 CreateOperandNodes(TE, Operands);
12828 case Instruction::ExtractValue:
12829 case Instruction::ExtractElement: {
12830 if (CurrentOrder.empty()) {
12831 LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
12834 dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence "
12836 for (
unsigned Idx : CurrentOrder)
12837 dbgs() <<
" " << Idx;
12844 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
12845 ReuseShuffleIndices, CurrentOrder);
12847 "(ExtractValueInst/ExtractElementInst).\n";
12851 TE->setOperands(Operands);
12854 case Instruction::InsertElement: {
12855 assert(ReuseShuffleIndices.
empty() &&
"All inserts should be unique");
12857 auto OrdCompare = [](
const std::pair<int, int> &
P1,
12858 const std::pair<int, int> &
P2) {
12859 return P1.first >
P2.first;
12862 decltype(OrdCompare)>
12863 Indices(OrdCompare);
12864 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
12866 Indices.emplace(Idx,
I);
12868 OrdersType CurrentOrder(VL.size(), VL.size());
12869 bool IsIdentity =
true;
12870 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
12871 CurrentOrder[Indices.top().second] =
I;
12872 IsIdentity &= Indices.top().second ==
I;
12876 CurrentOrder.clear();
12877 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
12879 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (InsertElementInst).\n";
12882 TE->setOperands(Operands);
12883 buildTreeRec(
TE->getOperand(1),
Depth + 1, {TE, 1});
12886 case Instruction::Load: {
12893 TreeEntry *
TE =
nullptr;
12896 case TreeEntry::Vectorize:
12897 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
12898 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
12899 if (CurrentOrder.empty())
12900 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (LoadInst).\n";
12904 <<
"SLP: added a new TreeEntry (jumbled LoadInst).\n";
12907 case TreeEntry::CompressVectorize:
12909 TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
12910 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
12913 <<
"SLP: added a new TreeEntry (masked LoadInst + compress).\n";
12916 case TreeEntry::StridedVectorize:
12918 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
12919 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
12920 TreeEntryToStridedPtrInfoMap[
TE] = SPtrInfo;
12921 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (strided LoadInst).\n";
12924 case TreeEntry::ScatterVectorize:
12926 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
12927 UserTreeIdx, ReuseShuffleIndices);
12930 <<
"SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
12933 case TreeEntry::CombinedVectorize:
12934 case TreeEntry::SplitVectorize:
12935 case TreeEntry::NeedToGather:
12938 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
12939 assert(Operands.
size() == 1 &&
"Expected a single operand only");
12940 SmallVector<int>
Mask;
12944 TE->setOperands(Operands);
12945 if (State == TreeEntry::ScatterVectorize)
12946 buildTreeRec(PointerOps,
Depth + 1, {
TE, 0});
12949 case Instruction::ZExt:
12950 case Instruction::SExt:
12951 case Instruction::FPToUI:
12952 case Instruction::FPToSI:
12953 case Instruction::FPExt:
12954 case Instruction::PtrToInt:
12955 case Instruction::IntToPtr:
12956 case Instruction::SIToFP:
12957 case Instruction::UIToFP:
12958 case Instruction::Trunc:
12959 case Instruction::FPTrunc:
12960 case Instruction::BitCast: {
12961 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
12962 std::make_pair(std::numeric_limits<unsigned>::min(),
12963 std::numeric_limits<unsigned>::max()));
12964 if (ShuffleOrOp == Instruction::ZExt ||
12965 ShuffleOrOp == Instruction::SExt) {
12966 CastMaxMinBWSizes = std::make_pair(
12967 std::max<unsigned>(DL->getTypeSizeInBits(VL0->
getType()),
12969 std::min<unsigned>(
12972 }
else if (ShuffleOrOp == Instruction::Trunc) {
12973 CastMaxMinBWSizes = std::make_pair(
12974 std::max<unsigned>(
12977 std::min<unsigned>(DL->getTypeSizeInBits(VL0->
getType()),
12980 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
12981 ReuseShuffleIndices);
12982 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (CastInst).\n";
12985 TE->setOperands(Operands);
12987 buildTreeRec(
TE->getOperand(
I),
Depth, {TE, I});
12988 if (ShuffleOrOp == Instruction::Trunc) {
12989 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
12990 }
else if (ShuffleOrOp == Instruction::SIToFP ||
12991 ShuffleOrOp == Instruction::UIToFP) {
12992 unsigned NumSignBits =
12995 APInt
Mask = DB->getDemandedBits(OpI);
12996 NumSignBits = std::max(NumSignBits,
Mask.countl_zero());
12998 if (NumSignBits * 2 >=
13000 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
13004 case Instruction::ICmp:
13005 case Instruction::FCmp: {
13008 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
13009 ReuseShuffleIndices);
13018 "Commutative Predicate mismatch");
13021 Operands.
back() =
Ops.getVL(1);
13028 if (
Cmp->getPredicate() != P0)
13032 TE->setOperands(Operands);
13033 buildTreeRec(Operands.
front(),
Depth, {TE, 0});
13034 buildTreeRec(Operands.
back(),
Depth, {TE, 1});
13035 if (ShuffleOrOp == Instruction::ICmp) {
13036 unsigned NumSignBits0 =
13038 if (NumSignBits0 * 2 >=
13040 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
13041 unsigned NumSignBits1 =
13043 if (NumSignBits1 * 2 >=
13045 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
13049 case Instruction::Select:
13050 case Instruction::FNeg:
13051 case Instruction::Add:
13052 case Instruction::FAdd:
13053 case Instruction::Sub:
13054 case Instruction::FSub:
13055 case Instruction::Mul:
13056 case Instruction::FMul:
13057 case Instruction::UDiv:
13058 case Instruction::SDiv:
13059 case Instruction::FDiv:
13060 case Instruction::URem:
13061 case Instruction::SRem:
13062 case Instruction::FRem:
13063 case Instruction::Shl:
13064 case Instruction::LShr:
13065 case Instruction::AShr:
13066 case Instruction::And:
13067 case Instruction::Or:
13068 case Instruction::Xor:
13069 case Instruction::Freeze: {
13070 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
13071 ReuseShuffleIndices);
13073 dbgs() <<
"SLP: added a new TreeEntry "
13074 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
13080 Operands[0] =
Ops.getVL(0);
13081 Operands[1] =
Ops.getVL(1);
13083 TE->setOperands(Operands);
13085 buildTreeRec(
TE->getOperand(
I),
Depth + 1, {TE, I});
13088 case Instruction::GetElementPtr: {
13089 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
13090 ReuseShuffleIndices);
13091 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (GetElementPtrInst).\n";
13093 TE->setOperands(Operands);
13096 buildTreeRec(Operands[
I],
Depth + 1, {
TE,
I});
13099 case Instruction::Store: {
13100 assert(CurrentOrder.empty() &&
13101 "Expected ordered store during tree building");
13102 if (State == TreeEntry::StridedVectorize) {
13104 newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
13105 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
13106 TreeEntryToStridedPtrInfoMap[
TE] = SPtrInfo;
13108 dbgs() <<
"SLP: added a new TreeEntry (strided StoreInst).\n";
13110 TE->setOperands(Operands);
13111 buildTreeRec(
TE->getOperand(0),
Depth + 1, {TE, 0});
13114 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
13115 ReuseShuffleIndices, CurrentOrder);
13116 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (StoreInst).\n";
13118 TE->setOperands(Operands);
13119 buildTreeRec(
TE->getOperand(0),
Depth + 1, {TE, 0});
13122 case Instruction::Call: {
13128 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
13129 ReuseShuffleIndices);
13130 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (CallInst).\n";
13135 Operands[0] =
Ops.getVL(0);
13136 Operands[1] =
Ops.getVL(1);
13138 TE->setOperands(Operands);
13144 buildTreeRec(
TE->getOperand(
I),
Depth + 1, {TE, I});
13148 case Instruction::ShuffleVector: {
13149 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
13150 ReuseShuffleIndices);
13151 if (S.isAltShuffle()) {
13152 LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (isAltShuffle).\n";
13157 dbgs() <<
"SLP: added a new TreeEntry (ShuffleVectorInst).\n";
13171 "Expected different main/alternate predicates.");
13187 TE->setOperands(Operands);
13188 buildTreeRec(Operands.
front(),
Depth + 1, {TE, 0});
13189 buildTreeRec(Operands.
back(),
Depth + 1, {TE, 1});
13196 Operands[0] =
Ops.getVL(0);
13197 Operands[1] =
Ops.getVL(1);
13199 TE->setOperands(Operands);
13201 buildTreeRec(
TE->getOperand(
I),
Depth + 1, {TE, I});
13219 for (
const auto *Ty : ST->elements())
13220 if (Ty != *ST->element_begin())
13222 N *= ST->getNumElements();
13223 EltTy = *ST->element_begin();
13225 N *= AT->getNumElements();
13226 EltTy = AT->getElementType();
13229 N *= VT->getNumElements();
13230 EltTy = VT->getElementType();
13236 size_t VTSize = DL->getTypeStoreSizeInBits(
getWidenedType(EltTy,
N));
13237 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
13238 VTSize != DL->getTypeStoreSizeInBits(T))
13245 bool ResizeAllowed)
const {
13247 assert(It != VL.
end() &&
"Expected at least one extract instruction.");
13254 Value *Vec = E0->getOperand(0);
13256 CurrentOrder.
clear();
13260 if (E0->getOpcode() == Instruction::ExtractValue) {
13272 unsigned E = VL.
size();
13273 if (!ResizeAllowed && NElts !=
E)
13276 unsigned MinIdx = NElts, MaxIdx = 0;
13281 if (Inst->getOperand(0) != Vec)
13289 const unsigned ExtIdx = *Idx;
13290 if (ExtIdx >= NElts)
13292 Indices[
I] = ExtIdx;
13293 if (MinIdx > ExtIdx)
13295 if (MaxIdx < ExtIdx)
13298 if (MaxIdx - MinIdx + 1 >
E)
13300 if (MaxIdx + 1 <=
E)
13304 bool ShouldKeepOrder =
true;
13311 for (
unsigned I = 0;
I <
E; ++
I) {
13314 const unsigned ExtIdx = Indices[
I] - MinIdx;
13315 if (CurrentOrder[ExtIdx] !=
E) {
13316 CurrentOrder.
clear();
13319 ShouldKeepOrder &= ExtIdx ==
I;
13320 CurrentOrder[ExtIdx] =
I;
13322 if (ShouldKeepOrder)
13323 CurrentOrder.
clear();
13325 return ShouldKeepOrder;
/// Reports whether instruction \p I can be treated as having all of its users
/// vectorized.
///
/// The result is true when \p I has exactly one use and either no
/// \p VectorizedVals set was supplied or that set contains \p I. Otherwise it
/// is true only when every user of \p I satisfies isVectorized(), satisfies
/// isVectorLikeInstWithConstOps(), or is an ExtractElementInst recorded in
/// MustGather.
13328bool BoUpSLP::areAllUsersVectorized(
13329 Instruction *
I,
const SmallDenseSet<Value *> *VectorizedVals)
const {
// Single-use fast path first; the per-user scan is only evaluated when the
// fast path does not already yield true.
13330 return (
I->hasOneUse() && (!VectorizedVals || VectorizedVals->
contains(
I))) ||
13331 all_of(
I->users(), [
this](User *U) {
13332 return isVectorized(U) || isVectorLikeInstWithConstOps(U) ||
13333 (isa<ExtractElementInst>(U) && MustGather.contains(U));
13338 const InstructionsState &S,
13339 DominatorTree &DT,
const DataLayout &DL,
13340 TargetTransformInfo &TTI,
13341 const TargetLibraryInfo &TLI);
/// Walks every entry of VectorizableTree and accumulates a scalar-instruction
/// total in a local `Count`.
///
/// NOTE(review): this excerpt omits the statements guarded by most of the
/// conditions below, so the comments describe only which conditions are
/// tested, not what each one contributes to the count. Confirm against the
/// full source.
13343unsigned BoUpSLP::getNumScalarInsts()
const {
13344 unsigned Count = 0;
13345 for (
const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
13346 const TreeEntry &
TE = *Ptr;
// Entries recorded in DeletedNodes are tested for first.
13347 if (DeletedNodes.contains(&TE))
// Gather entries and entries recorded in TransformedToGatherNodes share one
// branch.
13349 if (
TE.isGather() || TransformedToGatherNodes.contains(&TE)) {
13363 if (
TE.State == TreeEntry::CombinedVectorize)
// Per-scalar pass: copyable elements and integer/FP div-rem instructions are
// singled out by the visible conditions.
13368 for (
Value *V :
TE.Scalars) {
13370 (
TE.hasCopyableElements() &&
TE.isCopyableElement(V)))
13376 if (
I && (
I->isIntDivRem() ||
I->isFPDivRem()))
// Second pass, only for entries that are not combined ops and have a state.
// The assertion messages below label the Select branch "minmax" and the
// FAdd/FSub branch "fma"; both assert that Count is still positive.
13389 if (
TE.CombinedOp == TreeEntry::NotCombinedOp &&
TE.hasState()) {
13390 unsigned Opcode =
TE.getOpcode();
13391 if (Opcode == Instruction::Select) {
13392 for (
Value *V :
TE.Scalars) {
13393 if (
TE.hasCopyableElements() &&
TE.isCopyableElement(V))
13400 assert(
Count > 0 &&
"Underflow in scalar inst count (minmax)");
13404 }
else if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub) {
13405 for (
Value *V :
TE.Scalars) {
13406 if (
TE.hasCopyableElements() &&
TE.isCopyableElement(V))
// Tests for a null I, or for an alternate-shuffle entry whose instruction is
// neither FAdd nor FSub.
13409 if (!
I || (
TE.isAltShuffle() &&
I->getOpcode() != Instruction::FAdd &&
13410 I->getOpcode() != Instruction::FSub))
13414 assert(
Count > 0 &&
"Underflow in scalar inst count (fma)");
/// Walks every entry of VectorizableTree, then every ExternalUses record, and
/// accumulates a vector-instruction total in a local `Count`.
///
/// NOTE(review): most statements guarded by the conditions below are missing
/// from this excerpt. Only the tests themselves, and the single visible
/// `Count +=` update, are described. Confirm against the full source.
13424unsigned BoUpSLP::getNumVectorInsts()
const {
13425 unsigned Count = 0;
// Set of values inserted via EE->getVectorOperand(); being a set, each
// distinct value is stored once.
13426 SmallPtrSet<Value *, 4> GatherExtractSourceVecs;
13427 for (
const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
13428 const TreeEntry &
TE = *Ptr;
13429 if (DeletedNodes.contains(&TE))
13431 if (
TE.State == TreeEntry::CombinedVectorize)
13433 bool IsGatherOrTransformed =
13434 TE.isGather() || TransformedToGatherNodes.contains(&TE);
13435 if (IsGatherOrTransformed) {
13436 if (
TE.hasState()) {
// Looks for a different tree entry holding the same scalars with the same
// vector factor.
13437 if (
const TreeEntry *
E =
13438 getSameValuesTreeEntry(
TE.getMainOp(),
TE.Scalars);
13439 E &&
E != &TE &&
E->getVectorFactor() ==
TE.getVectorFactor())
// Same lookup using RevScalars (its construction is not visible here).
13442 if (
const TreeEntry *
E =
13443 getSameValuesTreeEntry(
TE.getMainOp(), RevScalars);
13444 E &&
E->getVectorFactor() ==
TE.getVectorFactor()) {
13456 GatherExtractSourceVecs.
insert(EE->getVectorOperand());
13458 for (
Value *V :
TE.Scalars) {
13468 if (
TE.getOpcode() == Instruction::InsertElement ||
13469 TE.getOpcode() == Instruction::ExtractElement)
13471 if (
TE.State == TreeEntry::SplitVectorize)
13475 if (!
TE.ReorderIndices.empty() || !
TE.ReuseShuffleIndices.empty())
// Adds the number of distinct values collected in GatherExtractSourceVecs.
13478 Count += GatherExtractSourceVecs.
size();
// CountedExtracts deduplicates on EU.Scalar: the insert(...).second test
// below is false for a scalar that was already recorded.
13481 SmallPtrSet<Value *, 8> CountedExtracts;
13482 for (
const ExternalUser &EU : ExternalUses) {
13485 if (EU.User && EphValues.count(EU.User))
13487 if (ExternalUsesAsOriginalScalar.contains(EU.Scalar))
13489 if (!CountedExtracts.
insert(EU.Scalar).second)
/// Fills \p Mask with a shuffle mask for this entry's scalars.
///
/// \param IsAltOp    Predicate applied to each scalar's instruction; when it
///                   holds, the mask element is written as `Sz + Idx`, where
///                   Sz is the number of scalars.
/// \param Mask       Output mask. If ReuseShuffleIndices is non-empty the mask
///                   is finally remapped through it, with PoisonMaskElem
///                   entries kept as PoisonMaskElem.
/// \param OpScalars  Optional output; its use is not visible in this excerpt.
/// \param AltScalars Optional output; its use is not visible in this excerpt.
///
/// NOTE(review): the branch taken when IsAltOp is false is elided here.
/// Presumably it writes the un-offset index; confirm against the full source.
13496void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
13497 const function_ref<
bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
13498 SmallVectorImpl<Value *> *OpScalars,
13499 SmallVectorImpl<Value *> *AltScalars)
const {
13500 unsigned Sz = Scalars.size();
// OrderMask is only consulted when ReorderIndices is non-empty.
13502 SmallVector<int> OrderMask;
13503 if (!ReorderIndices.empty())
13505 for (
unsigned I = 0;
I < Sz; ++
I) {
// With a reorder in effect, the lane index comes from OrderMask.
13507 if (!ReorderIndices.empty())
13508 Idx = OrderMask[
I];
// Alternate-op lanes are offset by Sz.
13512 if (IsAltOp(OpInst)) {
13513 Mask[
I] = Sz + Idx;
// Remap the mask through ReuseShuffleIndices, then swap the result in.
13522 if (!ReuseShuffleIndices.
empty()) {
13524 transform(ReuseShuffleIndices, NewMask.
begin(), [&Mask](
int Idx) {
13525 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
13527 Mask.swap(NewMask);
13534 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(
I) == MainOp;
13544 assert(MainP != AltP &&
"Expected different main/alternate predicates.");
13553 assert((MainP ==
P || AltP ==
P || MainP == SwappedP || AltP == SwappedP) &&
13554 "CmpInst expected to match either main or alternate predicate or "
13556 return MainP !=
P && MainP != SwappedP;
13558 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(
I) == AltOp;
13576 return CI->getValue().isPowerOf2();
13582 return CI->getValue().isNegatedPowerOf2();
13587 if (IsConstant && IsUniform)
13589 else if (IsConstant)
13591 else if (IsUniform)
/// Helper class holding shuffle-analysis routines; it stores the scalar
/// element type they operate on.
13603class BaseShuffleAnalysis {
// Scalar type used by the methods of this class; null until set by the
// constructor.
13605 Type *ScalarTy =
nullptr;
// Stores the given scalar type in the ScalarTy member.
13607 BaseShuffleAnalysis(
Type *ScalarTy) : ScalarTy(ScalarTy) {}
/// Returns VNumElements / ScalarTyNumElements for \p V.
///
/// Asserts that \p V and the ScalarTy member are non-null, that \p V has a
/// fixed vector type (per the assertion message), and that VNumElements is
/// strictly greater than, and an exact multiple of, ScalarTyNumElements.
/// NOTE(review): how the two element counts are computed is elided from this
/// excerpt.
13615 unsigned getVF(
Value *V)
const {
13616 assert(V &&
"V cannot be nullptr");
13618 "V does not have FixedVectorType");
13619 assert(ScalarTy &&
"ScalarTy cannot be nullptr");
13621 unsigned VNumElements =
13623 assert(VNumElements > ScalarTyNumElements &&
13624 "the number of elements of V is not large enough");
13625 assert(VNumElements % ScalarTyNumElements == 0 &&
13626 "the number of elements of V is not a vectorized value");
13627 return VNumElements / ScalarTyNumElements;
/// Tests \p Mask for identity-like structure relative to \p VecTy.
///
/// NOTE(review): the remaining parameter(s) and most of the body are elided
/// from this excerpt. The only visible check applies when the mask length
/// (Limit) is a multiple of VF: the mask is then examined as Limit / VF
/// consecutive slices of VF elements each. What is required of each slice is
/// not visible here.
13633 static bool isIdentityMask(ArrayRef<int> Mask,
const FixedVectorType *VecTy,
13635 int Limit =
Mask.size();
// Per-slice check over Limit / VF slices of width VF.
13647 if (Limit % VF == 0 &&
all_of(
seq<int>(0, Limit / VF), [=](
int Idx) {
13648 ArrayRef<int> Slice =
Mask.slice(Idx * VF, VF);
/// Combines \p Mask with \p ExtMask and stores the result back into \p Mask.
///
/// For each position I of \p ExtMask, the original \p Mask is read at
/// `ExtMask[I] % VF`, where VF is the size of \p Mask on entry. The combined
/// result is built in NewMask and swapped into \p Mask at the end.
/// NOTE(review): the construction of NewMask and the use of \p LocalVF are
/// elided from this excerpt.
13661 static void combineMasks(
unsigned LocalVF, SmallVectorImpl<int> &Mask,
13662 ArrayRef<int> ExtMask) {
13663 unsigned VF =
Mask.size();
13665 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
// Index into Mask modulo its original size.
13668 int MaskedIdx =
Mask[ExtMask[
I] % VF];
13672 Mask.swap(NewMask);
/// Looks through shufflevector instructions feeding \p V, rewriting \p V and
/// \p Mask in place.
///
/// \param V             In/out value. The visible code selects operand 0 or
///                      operand 1 of a shuffle as the next value to inspect.
/// \param Mask          In/out mask. It is composed with each looked-through
///                      shuffle's own mask via combineMasks().
/// \param SinglePermute Passed on to isIdentityMask(); it also gates the
///                      final return expression.
/// \returns a boolean that can only be true when \p SinglePermute is set. The
///          visible part of the expression also tests for a zero-element
///          splat shuffle of matching mask size.
///
/// NOTE(review): the enclosing loop and several conditions are elided from
/// this excerpt; confirm details against the full source.
13708 static bool peekThroughShuffles(
Value *&V, SmallVectorImpl<int> &Mask,
13709 bool SinglePermute) {
// Remember a shuffle (and the mask at that point) for which the current mask
// was an identity, so it can be restored later via Mask.swap(IdentityMask).
13711 ShuffleVectorInst *IdentityOp =
nullptr;
13712 SmallVector<int> IdentityMask;
13721 if (isIdentityMask(Mask, SVTy,
false)) {
13722 if (!IdentityOp || !SinglePermute ||
13723 (isIdentityMask(Mask, SVTy,
true) &&
13725 IdentityMask.
size()))) {
13730 IdentityMask.
assign(Mask);
// Zero-element splat shuffles also record the current mask.
13750 if (SV->isZeroEltSplat()) {
13752 IdentityMask.
assign(Mask);
13754 int LocalVF =
Mask.size();
13757 LocalVF = SVOpTy->getNumElements();
13761 static_cast<unsigned>(
I) >= SV->getShuffleMask().size())
13763 ExtMask[Idx] = SV->getMaskValue(
I);
// Both shuffle operands are in use (neither is undef).
13773 if (!IsOp1Undef && !IsOp2Undef) {
13775 for (
int &
I : Mask) {
13778 if (SV->getMaskValue(
I % SV->getShuffleMask().size()) ==
// Compose the shuffle's own mask with Mask, then adopt the composition.
13784 SmallVector<int> ShuffleMask(SV->getShuffleMask());
13785 combineMasks(LocalVF, ShuffleMask, Mask);
13786 Mask.swap(ShuffleMask);
13788 Op = SV->getOperand(0);
13790 Op = SV->getOperand(1);
13793 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
13798 "Expected masks of same sizes.");
13803 Mask.swap(IdentityMask);
13805 return SinglePermute &&
13808 (Shuffle &&
Mask.size() == Shuffle->getShuffleMask().size() &&
13809 Shuffle->isZeroEltSplat() &&
13813 Shuffle->getShuffleMask()[
P.index()] == 0;
/// Emits, through \p Builder, a shuffle of \p V1 (and optionally \p V2)
/// described by \p Mask, after first trying to look through shuffles that
/// feed the operands.
///
/// \tparam T                Result type produced by the builder callbacks.
/// \tparam ShuffleBuilderTy Builder type. The visible code calls its
///                          resizeToMatch(), createIdentity(),
///                          createShuffleVector() and createPoison() members.
/// \param V1       First vector operand; asserted to be non-null.
/// \param V2       Second vector operand.
/// \param Mask     Shuffle mask over the operand(s).
/// \param Builder  Object that materialises the result.
/// \param ScalarTy Scalar element type.
///
/// NOTE(review): the trailing parameter pack and many interior statements are
/// elided from this excerpt. The comments below cover only visible
/// operations; confirm control flow against the full source.
13826 template <
typename T,
typename ShuffleBuilderTy,
typename...
Args>
13827 static T createShuffle(
Value *V1,
Value *V2, ArrayRef<int> Mask,
13828 ShuffleBuilderTy &Builder,
Type *ScalarTy,
13830 assert(V1 &&
"Expected at least one vector value.");
// Work on a private copy of the caller's mask.
13832 SmallVector<int> NewMask(Mask);
13833 if (ScalarTyNumElements != 1) {
13839 Builder.resizeToMatch(V1, V2);
13840 int VF =
Mask.size();
13842 VF = FTy->getNumElements();
// Split Mask into two per-operand masks; the second one is stored with VF
// subtracted from the element.
13853 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
13855 CombinedMask1[
I] =
Mask[
I];
13857 CombinedMask2[
I] =
Mask[
I] - VF;
// Look through shuffles feeding each operand (SinglePermute == false).
13864 (void)peekThroughShuffles(Op1, CombinedMask1,
false);
13865 (void)peekThroughShuffles(Op2, CombinedMask2,
false);
13871 for (
auto [Idx,
I] :
enumerate(CombinedMask1)) {
13874 ExtMask1[Idx] = SV1->getMaskValue(
I);
13878 ->getNumElements(),
13879 ExtMask1, UseMask::SecondArg);
13880 SmallVector<int> ExtMask2(CombinedMask2.size(),
PoisonMaskElem);
13881 for (
auto [Idx,
I] :
enumerate(CombinedMask2)) {
13884 ExtMask2[Idx] = SV2->getMaskValue(
I);
13888 ->getNumElements(),
13889 ExtMask2, UseMask::SecondArg);
// When both operands are shuffles whose first operands have the same type,
// and that type differs from the shuffle's result type, step down to those
// first operands and fold each shuffle's mask into the matching combined mask.
13890 if (SV1->getOperand(0)->getType() ==
13891 SV2->getOperand(0)->getType() &&
13892 SV1->getOperand(0)->getType() != SV1->getType() &&
13895 Op1 = SV1->getOperand(0);
13896 Op2 = SV2->getOperand(0);
13897 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
13898 int LocalVF = ShuffleMask1.size();
13900 LocalVF = FTy->getNumElements();
13901 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
13902 CombinedMask1.swap(ShuffleMask1);
13903 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
13904 LocalVF = ShuffleMask2.size();
13906 LocalVF = FTy->getNumElements();
13907 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
13908 CombinedMask2.swap(ShuffleMask2);
// Repeat until neither operand changed during an iteration.
13911 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
13912 Builder.resizeToMatch(Op1, Op2);
13914 ->getElementCount()
13915 .getKnownMinValue(),
13917 ->getElementCount()
13918 .getKnownMinValue());
// Merge the second mask back into the first; elements are offset by VF unless
// both operands ended up being the same value.
13919 for (
int I = 0,
E =
Mask.size();
I <
E; ++
I) {
13922 "Expected undefined mask element");
13923 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
13932 return Builder.createIdentity(Op1);
13933 return Builder.createShuffleVector(
13938 return Builder.createPoison(
// Single-operand path: look through shuffles with SinglePermute == true.
13940 bool IsIdentity = peekThroughShuffles(V1, NewMask,
true);
13941 assert(V1 &&
"Expected non-null value after looking through shuffles.");
13944 return Builder.createShuffleVector(V1, NewMask,
Arguments...);
13945 return Builder.createIdentity(V1);
13951 ArrayRef<int> Mask) {
13960static std::pair<InstructionCost, InstructionCost>
13971 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
13980 ScalarCost =
TTI.getPointersChainCost(
13981 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
13985 for (
Value *V : Ptrs) {
13986 if (V == BasePtr) {
13999 if (PtrsRetainedInVecCode.
size() == Ptrs.size()) {
14004 VecCost =
TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
14005 TTI::PointersChainInfo::getKnownStride(),
14015 [](
const Value *V) {
14017 return Ptr && !Ptr->hasAllConstantIndices();
14019 ? TTI::PointersChainInfo::getUnknownStride()
14020 : TTI::PointersChainInfo::getKnownStride();
14023 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy,
CostKind);
14027 if (It != Ptrs.
end())
14032 VecCost =
TTI.getGEPCost(BaseGEP->getSourceElementType(),
14033 BaseGEP->getPointerOperand(), Indices, VecTy,
14038 return std::make_pair(ScalarCost, VecCost);
/// Attempts to reorder the scalars of gather node \p TE by grouping them.
///
/// Preconditions (asserted): \p TE is a gather node and has no ReorderIndices
/// yet. Scalars are bucketed into SortedValues by a (Key, Idx) pair. If the
/// resulting order differs from the original and more than one instruction
/// was seen, TE.Scalars and TE.ReorderIndices are rewritten in bucket order.
/// A cost comparison follows; when `Cost >= BVCost`, the reorder indices are
/// copied into a mask and TE.ReorderIndices is cleared.
///
/// NOTE(review): key generation, the early-exit bodies, and the cost
/// computation are largely elided from this excerpt. In particular, whether
/// the final clear() fully restores the original scalar order is not visible
/// here; confirm against the full source.
14041void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
14042 assert(
TE.isGather() &&
TE.ReorderIndices.empty() &&
14043 "Expected gather node without reordering.");
14045 SmallSet<size_t, 2> LoadKeyUsed;
// Condition over two-element nodes, non-alternate nodes with state, and nodes
// whose scalars match another tree entry; the guarded statement is elided.
14049 if (
TE.Scalars.size() == 2 || (
TE.hasState() && !
TE.isAltShuffle()) ||
14054 return VectorizableTree[Idx]->isSame(TE.Scalars);
// Helper lambda taking a key and a load; it consults and updates LoadsMap
// under the (Key, Ptr) pair. Its return value is not visible in this excerpt.
14058 auto GenerateLoadsSubkey = [&](
size_t Key, LoadInst *LI) {
14063 auto LIt = LoadsMap.
find(std::make_pair(
Key, Ptr));
14064 if (LIt != LoadsMap.
end()) {
14065 for (LoadInst *RLI : LIt->second) {
14067 LI->
getType(), LI->getPointerOperand(), *DL, *SE,
14071 for (LoadInst *RLI : LIt->second) {
14073 LI->getPointerOperand(), *TLI)) {
14078 if (LIt->second.size() > 2) {
14080 hash_value(LIt->second.back()->getPointerOperand());
// Record this load under its (Key, Ptr) bucket.
14086 LoadsMap.
try_emplace(std::make_pair(
Key, Ptr)).first->second.push_back(LI);
// SortedValues: Key -> (Idx -> values), kept in insertion order.
// KeyToIndex: value -> the positions recorded for it.
14089 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
14090 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
14091 bool IsOrdered =
true;
14092 unsigned NumInstructions = 0;
14096 size_t Key = 1, Idx = 1;
14104 auto &Container = SortedValues[
Key];
// Condition involving IsOrdered: compares the last recorded position of the
// previous value in this bucket against I - 1. The guarded statement is
// elided.
14105 if (IsOrdered && !KeyToIndex.
contains(V) &&
14108 ((Container.contains(Idx) &&
14109 KeyToIndex.
at(Container[Idx].back()).back() !=
I - 1) ||
14110 (!Container.empty() && !Container.contains(Idx) &&
14111 KeyToIndex.
at(Container.back().second.back()).back() !=
I - 1)))
14113 auto &KTI = KeyToIndex[
V];
14115 Container[Idx].push_back(V);
// Only rewrite the node when grouping changed the order and there is more
// than one instruction.
14120 if (!IsOrdered && NumInstructions > 1) {
// Sized to Scalars.size() and filled with Scalars.size() before being
// overwritten below.
14122 TE.ReorderIndices.resize(
TE.Scalars.size(),
TE.Scalars.size());
14123 for (
const auto &
D : SortedValues) {
14124 for (
const auto &
P :
D.second) {
14126 for (
Value *V :
P.second) {
14127 ArrayRef<unsigned> Indices = KeyToIndex.
at(V);
// Place each occurrence of V consecutively, starting at Cnt.
14128 for (
auto [K, Idx] :
enumerate(Indices)) {
14129 TE.ReorderIndices[Cnt +
K] = Idx;
14130 TE.Scalars[Cnt +
K] =
V;
14132 Sz += Indices.
size();
14133 Cnt += Indices.
size();
14137 *TTI,
TE.Scalars.front()->getType(), Sz);
14141 }
else if (!
P.second.empty() &&
isConstant(
P.second.front())) {
14149 if (!
TE.ReuseShuffleIndices.empty() ||
TE.ReorderIndices.empty())
14154 auto *ScalarTy =
TE.Scalars.front()->getType();
14156 for (
auto [Idx, Sz] : SubVectors) {
14163 int Sz =
TE.Scalars.size();
14164 SmallVector<int> ReorderMask(
TE.ReorderIndices.begin(),
14165 TE.ReorderIndices.end());
14171 ReorderMask[
I] =
I +
TE.ReorderIndices.size();
14175 any_of(ReorderMask, [&](
int I) {
return I >= Sz; })
14178 VecTy, ReorderMask);
14184 DemandedElts.clearBit(
I);
14186 ReorderMask[
I] =
I;
14188 ReorderMask[
I] =
I + Sz;
14194 if (!DemandedElts.isAllOnes())
// When Cost is not below BVCost, the reorder indices are copied into Mask and
// then cleared.
14196 if (
Cost >= BVCost) {
14197 SmallVector<int>
Mask(
TE.ReorderIndices.begin(),
TE.ReorderIndices.end());
14199 TE.ReorderIndices.clear();
14206 const InstructionsState &S,
14212 return V->getType()->getScalarType()->isFloatingPointTy();
14214 "Can only convert to FMA for floating point types");
14215 assert(S.isAddSubLikeOp() &&
"Can only convert to FMA for add/sub");
14220 for (
Value *V : VL) {
14224 if (S.isCopyableElement(
I))
14226 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(
I);
14227 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
14230 FMF &= FPCI->getFastMathFlags();
14234 if (!CheckForContractable(VL))
14237 InstructionsCompatibilityAnalysis
Analysis(DT,
DL,
TTI, TLI);
14244 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
14246 if (!CheckForContractable(Operands.
front()))
14254 for (
Value *V : VL) {
14258 if (!S.isCopyableElement(
I))
14260 FMF &= FPCI->getFastMathFlags();
14261 FMulPlusFAddCost +=
TTI.getInstructionCost(
I,
CostKind);
14264 for (
auto [V,
Op] :
zip(VL, Operands.
front())) {
14265 if (S.isCopyableElement(V))
14268 if (!
I || !
I->hasOneUse() || OpS.isCopyableElement(
I)) {
14270 FMACost +=
TTI.getInstructionCost(OpI,
CostKind);
14277 FMF &= FPCI->getFastMathFlags();
14278 FMulPlusFAddCost +=
TTI.getInstructionCost(
I,
CostKind);
/// Cost-based matcher for a Shl tree node whose left operand is a ZExt node
/// and whose right operand is a gather of shift amounts. The visible exit
/// returns true only when the accumulated BitcastCost is strictly lower than
/// VecCost.
///
/// \param TE      Shl node to inspect (asserted below).
/// \param Order   Output; assigned VF sentinel entries in the permutation path
///                below. The caller visible later in this file moves it into
///                the node's ReorderIndices.
/// \param IsBSwap Output flag; its assignments are not part of this listing.
///                The caller uses it to pick a ReducedBitcastBSwap* opcode.
/// \param ForLoads Output flag; its assignments are not part of this listing.
///                The caller uses it to pick a *Loads opcode.
///
/// NOTE(review): this listing omits original lines (the fused leading numbers
/// are non-contiguous), including every early-exit statement that follows the
/// guards below. Comments describe only what is visible.
14286bool BoUpSLP::matchesShlZExt(
const TreeEntry &TE,
OrdersType &Order,
14287 bool &IsBSwap,
bool &ForLoads)
const {
14288 assert(
TE.hasState() &&
TE.getOpcode() == Instruction::Shl &&
14289 "Expected Shl node.");
// Guard on the Shl node itself: it trips when the node is not in Vectorize
// state, carries reorder or reuse-shuffle indices, has a MinBWs entry, or has
// a scalar for which hasOneUse() is false.
14292 if (
TE.State != TreeEntry::Vectorize || !
TE.ReorderIndices.empty() ||
14293 !
TE.ReuseShuffleIndices.empty() || MinBWs.contains(&TE) ||
14294 any_of(
TE.Scalars, [](
Value *V) { return !V->hasOneUse(); }))
14296 Type *ScalarTy =
TE.getMainOp()->getType();
// Sz is the bit width of the Shl result element type.
14302 const unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
14303 const TreeEntry *LhsTE = getOperandEntry(&TE, 0);
14304 const TreeEntry *RhsTE = getOperandEntry(&TE, 1);
// Operand 0 must be a Vectorize-state ZExt node with no reorder/reuse
// indices, no MinBWs entry, and only one-use scalars.
14306 if (!(LhsTE->State == TreeEntry::Vectorize &&
14307 LhsTE->getOpcode() == Instruction::ZExt &&
14308 LhsTE->ReorderIndices.empty() && LhsTE->ReuseShuffleIndices.empty() &&
14309 !MinBWs.contains(LhsTE) &&
14310 all_of(LhsTE->Scalars, [](
Value *V) { return V->hasOneUse(); })))
// Stride is the bit width of SrcScalarTy (declared on a line missing here,
// presumably the ZExt source element type). The guard trips unless Stride is
// a power of two, smaller than Sz, and divides Sz; the rest of the condition
// is not visible.
14313 unsigned Stride = DL->getTypeSizeInBits(SrcScalarTy);
14314 if (!
isPowerOf2_64(Stride) || Stride >= Sz || Sz % Stride != 0 ||
// Operand 1 must be a gather node with no reorder/reuse indices and no MinBWs
// entry.
14317 if (!(RhsTE->isGather() && RhsTE->ReorderIndices.empty() &&
14318 RhsTE->ReuseShuffleIndices.empty() && !MinBWs.contains(RhsTE)))
// In-order test: CurrentValue advances by Stride per gathered value, and a
// constant must equal the pre-increment value (0, Stride, 2 * Stride, ...).
// The running total must also stay within Sz. How undef values are answered
// is on a missing line.
14321 unsigned CurrentValue = 0;
14323 if (
all_of(RhsTE->Scalars,
14325 CurrentValue += Stride;
14326 if (isa<UndefValue>(V))
14328 auto *C = dyn_cast<Constant>(V);
14331 return C->getUniqueInteger() == CurrentValue - Stride;
14333 CurrentValue <= Sz) {
// NOTE(review): presumably the alternative (out-of-order) path starts here;
// the lines separating it from the test above are missing. Order is filled
// with VF copies of the sentinel value VF, and SeenPositions records the
// positions already taken.
14336 const unsigned VF = RhsTE->getVectorFactor();
14337 Order.assign(VF, VF);
14339 SmallBitVector SeenPositions(VF);
14342 if (VF * Stride > Sz)
// Per-element checks. Pos is computed on a missing line, presumably from Val.
// The visible tests cover an Order slot that is no longer the sentinel, a
// position outside [0, VF), and a position that was already seen.
14344 for (
const auto [Idx, V] :
enumerate(RhsTE->Scalars)) {
14350 const APInt &Val =
C->getUniqueInteger();
14355 if (Order[Idx] != VF || Pos >= VF)
14357 if (SeenPositions.test(Pos))
14359 SeenPositions.set(Pos);
// SrcType is a single integer type of Stride * VF bits, i.e. as wide as all
// source elements of the ZExt node together.
14367 auto *SrcType = IntegerType::getIntNTy(ScalarTy->
getContext(),
14368 Stride * LhsTE->getVectorFactor());
14370 SmallPtrSet<Value *, 4> CheckedExtracts;
14372 auto *SrcVecTy =
getWidenedType(SrcScalarTy, LhsTE->getVectorFactor());
14374 getCastContextHint(*getOperandEntry(LhsTE, 0));
// Visible cost terms: an Or reduction, the Shl, and a ZExt cast on VecTy,
// plus a BitCast between SrcType and SrcVecTy. NOTE(review): the variables
// receiving these sums (presumably VecCost and BitcastCost, used below) are
// declared on missing lines.
14376 TTI->getArithmeticReductionCost(Instruction::Or, VecTy, FMF,
CostKind) +
14377 TTI->getArithmeticInstrCost(Instruction::Shl, VecTy,
CostKind,
14378 getOperandInfo(LhsTE->Scalars)) +
14379 TTI->getCastInstrCost(
14380 Instruction::ZExt, VecTy,
14384 Instruction::BitCast, SrcType, SrcVecTy, CastCtx,
CostKind);
// A non-empty Order means a permutation was recorded above.
14385 if (!Order.empty()) {
14387 SmallVector<int>
Mask;
14393 constexpr unsigned ByteSize = 8;
// For 8-bit source elements (the first half of this condition is missing), a
// bswap intrinsic on SrcType is costed together with a BitCast. It replaces
// BitcastCost when it is not more expensive.
14395 DL->getTypeSizeInBits(SrcScalarTy) == ByteSize) {
14396 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
14398 TTI->getCastInstrCost(Instruction::BitCast, SrcType, SrcVecTy, CastCtx,
14400 TTI->getIntrinsicInstrCost(CostAttrs,
CostKind);
14401 if (BSwapCost <= BitcastCost) {
14402 BitcastCost = BSwapCost;
// If the ZExt operand is a Vectorize-state, non-alternate Load node with no
// reorder/reuse indices and only one-use scalars, cost a load of SrcType plus
// the bswap instead. LI is defined on a missing line.
14406 const TreeEntry *SrcTE = getOperandEntry(LhsTE, 0);
14407 if (SrcTE->State == TreeEntry::Vectorize &&
14408 SrcTE->ReorderIndices.empty() && SrcTE->ReuseShuffleIndices.empty() &&
14409 SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
14410 all_of(SrcTE->Scalars, [](
Value *V) { return V->hasOneUse(); })) {
14412 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
14414 TTI->getMemoryOpCost(Instruction::Load, SrcType, LI->getAlign(),
14415 LI->getPointerAddressSpace(),
CostKind) +
14416 TTI->getIntrinsicInstrCost(CostAttrs,
CostKind);
14417 if (BSwapCost <= BitcastCost) {
// NOTE(review): the destination of this SrcVecTy load cost is on a missing
// line.
14419 TTI->getMemoryOpCost(Instruction::Load, SrcVecTy, LI->getAlign(),
14420 LI->getPointerAddressSpace(),
CostKind);
14421 BitcastCost = BSwapCost;
14426 }
else if (Order.empty() && DL->getTypeSizeInBits(SrcScalarTy) == ByteSize) {
// In-order, byte-sized case: the same Load-node test as above, followed by
// the cost of a SrcType load and the cost of a SrcVecTy load. The variables
// they feed are not visible.
14428 const TreeEntry *SrcTE = getOperandEntry(LhsTE, 0);
14429 if (SrcTE->State == TreeEntry::Vectorize && SrcTE->ReorderIndices.empty() &&
14430 SrcTE->ReuseShuffleIndices.empty() &&
14431 SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
14432 all_of(SrcTE->Scalars, [](
Value *V) { return V->hasOneUse(); })) {
14435 TTI->getMemoryOpCost(Instruction::Load, SrcType, LI->getAlign(),
14436 LI->getPointerAddressSpace(),
CostKind);
14438 TTI->getMemoryOpCost(Instruction::Load, SrcVecTy, LI->getAlign(),
14439 LI->getPointerAddressSpace(),
CostKind);
// When SrcType differs from the Shl element type, a ZExt from SrcType to
// ScalarTy is added to BitcastCost.
14443 if (SrcType != ScalarTy) {
14444 BitcastCost += TTI->getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
// The match succeeds only when the bitcast form is strictly cheaper.
14447 return BitcastCost < VecCost;
/// Cost-based matcher for a Select node whose condition operand is a
/// vectorized alternate-opcode compare node and whose value operands are
/// gather nodes. The visible exit returns true when VecCost is strictly lower
/// than BVCost.
///
/// \param SelectTE            Select node to inspect (asserted below).
/// \param InversedCmpsIndices Output index list. How it is filled is not
///                            visible in this listing; the caller visible
///                            later in this file swaps the scalars of select
///                            operands 1 and 2 at each listed index.
///
/// NOTE(review): this listing omits original lines (the fused leading numbers
/// are non-contiguous), including the statements that follow each guard.
14450bool BoUpSLP::matchesInversedZExtSelect(
14451 const TreeEntry &SelectTE,
14452 SmallVectorImpl<unsigned> &InversedCmpsIndices)
const {
14453 assert(SelectTE.hasState() && SelectTE.getOpcode() == Instruction::Select &&
14454 "Expected select node.");
// Per-scalar ZExt test. Inst is defined on a missing line, so whether it is
// the scalar itself or something derived from it cannot be told from here.
14456 for (
auto [Idx, V] :
enumerate(SelectTE.Scalars)) {
14458 if (!Inst || Inst->getOpcode() != Instruction::ZExt)
// Operand 0 is the condition node; operands 1 and 2 are the selected values.
14464 const auto *CmpTE = getOperandEntry(&SelectTE, 0);
14465 const auto *Op1TE = getOperandEntry(&SelectTE, 1);
14466 const auto *Op2TE = getOperandEntry(&SelectTE, 2);
// Guard: the condition must be a Vectorize-state alternate-shuffle node with
// ICmp or FCmp opcode, and none of the three operand nodes may carry reorder
// or reuse-shuffle indices.
14470 if (CmpTE->State != TreeEntry::Vectorize || !CmpTE->isAltShuffle() ||
14471 (CmpTE->getOpcode() != Instruction::ICmp &&
14472 CmpTE->getOpcode() != Instruction::FCmp) ||
14473 !CmpTE->ReorderIndices.empty() || !CmpTE->ReuseShuffleIndices.empty() ||
14474 !Op1TE->ReorderIndices.empty() || !Op1TE->ReuseShuffleIndices.empty() ||
14475 !Op2TE->ReorderIndices.empty() || !Op2TE->ReuseShuffleIndices.empty())
// Both value operands must be gather nodes.
14478 if (!Op1TE->isGather() || !Op2TE->isGather())
// The main compare is matched against MatchCmp, a pattern declared on a
// missing line. MainPred is then copied from Pred, presumably the predicate
// that pattern binds.
14481 auto *
Cmp = CmpTE->getMainOp();
14484 if (!
match(Cmp, MatchCmp))
14486 CmpPredicate MainPred = Pred;
// Every compare scalar is matched against the same pattern and tested with
// hasOneUse(). The lines in between are missing.
14489 for (
const auto [Idx, V] :
enumerate(CmpTE->Scalars)) {
14490 if (!
match(V, MatchCmp))
14496 if (!
V->hasOneUse())
14501 if (InversedCmpsIndices.
empty())
// Cost of a single compare on VecTy using the main predicate.
// NOTE(review): the variable receiving it is declared on a missing line,
// presumably VecCost.
14509 TTI->getCmpSelInstrCost(CmpTE->getOpcode(), VecTy, CmpTy, MainPred,
14510 CostKind, getOperandInfo(CmpTE->getOperand(0)),
14511 getOperandInfo(CmpTE->getOperand(1)));
// BVCost accumulates per-instruction costs while walking the compare scalars.
// The definition of I is on a missing line.
14516 for (
Value *V : CmpTE->Scalars) {
14520 BVCost += TTI->getInstructionCost(
I,
CostKind);
// The match succeeds only when the single-compare form is strictly cheaper.
14522 return VecCost < BVCost;
/// Cost-based matcher for a root Select node that picks between gathered
/// constants and zeros. The visible exit returns true when BitcastCost does
/// not exceed SelectCost.
///
/// \param SelectTE Select node to inspect (asserted below).
///
/// NOTE(review): this listing omits original lines (the fused leading numbers
/// are non-contiguous), including the statements that follow each guard.
14525bool BoUpSLP::matchesSelectOfBits(
const TreeEntry &SelectTE)
const {
14526 assert(SelectTE.hasState() && SelectTE.getOpcode() == Instruction::Select &&
14527 "Expected select node.");
// Guards on the target and the node: data layout endianness, reorder or
// reuse-shuffle indices on the select, a missing UserIgnoreList or a node
// index other than 0, and any scalar for which hasOneUse() is false.
14528 if (DL->isBigEndian())
14530 if (!SelectTE.ReorderIndices.empty() || !SelectTE.ReuseShuffleIndices.empty())
14532 if (!UserIgnoreList || SelectTE.Idx != 0)
14534 if (
any_of(SelectTE.Scalars, [](
Value *V) { return !V->hasOneUse(); }))
// A predicate over the UserIgnoreList entries; its body is not visible here.
14537 if (
any_of(*UserIgnoreList,
// Both selected-value operands must be gather nodes.
14540 const TreeEntry *Op1TE = getOperandEntry(&SelectTE, 1);
14541 const TreeEntry *Op2TE = getOperandEntry(&SelectTE, 2);
14542 if (!Op1TE->isGather() || !Op2TE->isGather())
// NOTE(review): Op2TE->ReorderIndices is not tested here, unlike in
// matchesInversedZExtSelect. Op2TE is required to be all zeros just below, so
// its element order looks irrelevant. Confirm this is intentional.
14545 if (!Op1TE->ReorderIndices.empty() || !Op1TE->ReuseShuffleIndices.empty() ||
14546 !Op2TE->ReuseShuffleIndices.empty())
14548 Type *ScalarTy = Op1TE->Scalars.front()->getType();
// Every operand-2 scalar has to match m_ZeroInt().
14552 if (
any_of(Op2TE->Scalars, [](
Value *V) { return !match(V, m_ZeroInt()); }))
// Failure predicate for an enumerated element: it must be a constant integer
// V that is a power of two with Log2_64(V) equal to its index, i.e. the
// element at index i holds 1 << i. The range being enumerated is on a missing
// line, presumably the operand-1 scalars.
14557 return !(match(P.value(), m_ConstantInt(V)) && isPowerOf2_64(V) &&
14558 Log2_64(V) == P.index());
// DstTy is an integer type with one bit per vector element.
14562 auto *DstTy = IntegerType::getIntNTy(ScalarTy->
getContext(),
14563 SelectTE.getVectorFactor());
// When the select has a MinBWs entry, VecTy is rebuilt from
// EffectiveScalarTy, whose initializer is on a missing line.
14567 auto It = MinBWs.find(&SelectTE);
14568 if (It != MinBWs.end()) {
14569 auto *EffectiveScalarTy =
14571 VecTy =
getWidenedType(EffectiveScalarTy, SelectTE.getVectorFactor());
// A ZExt from DstTy to ScalarTy is added to BitcastCost when the two types
// differ.
14576 if (DstTy != ScalarTy) {
14577 BitcastCost += TTI->getCastInstrCost(Instruction::ZExt, ScalarTy, DstTy,
// Cost of the alternative: the vector select plus an Or reduction over VecTy.
// NOTE(review): the variable receiving this sum is declared on a missing
// line, presumably SelectCost.
14582 TTI->getCmpSelInstrCost(Instruction::Select, VecTy, CmpTy,
14584 getOperandInfo(Op1TE->Scalars),
14585 getOperandInfo(Op2TE->Scalars)) +
14586 TTI->getArithmeticReductionCost(Instruction::Or, VecTy, FMF,
CostKind);
// Ties go to the bitcast form (<=, unlike the strict < in matchesShlZExt).
14587 return BitcastCost <= SelectCost;
14592 BaseGraphSize = VectorizableTree.size();
// Scope guard for the graph-transform flag: the constructor sets the
// referenced bool to true and the destructor sets it back to false. The
// single instance, TransformContext, is bound to IsGraphTransformMode.
// NOTE(review): the destructor writes false unconditionally instead of
// restoring the value seen at construction, so nesting would clear the flag
// early. Confirm this is never nested. The name also spells "RAAI", not
// "RAII".
14594 class GraphTransformModeRAAI {
// Reference to the flag being toggled; despite the name, no previous value is
// saved.
14595 bool &SavedIsGraphTransformMode;
14598 GraphTransformModeRAAI(
bool &IsGraphTransformMode)
14599 : SavedIsGraphTransformMode(IsGraphTransformMode) {
14600 IsGraphTransformMode =
true;
14602 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode =
false; }
14603 } TransformContext(IsGraphTransformMode);
14612 const InstructionsState &S) {
14616 I2->getOperand(
Op));
14617 return all_of(Candidates, [
this](
14618 ArrayRef<std::pair<Value *, Value *>> Cand) {
14620 [](
const std::pair<Value *, Value *> &
P) {
14630 TreeEntry &E = *VectorizableTree[Idx];
14632 reorderGatherNode(E);
14637 constexpr unsigned VFLimit = 16;
14638 bool ForceLoadGather =
14639 count_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
14640 return TE->isGather() && TE->hasState() &&
14641 TE->getOpcode() == Instruction::Load &&
14642 TE->getVectorFactor() < VFLimit;
14648 return TE->isSame(VL) ||
all_of(VL, [&](
Value *V) {
14657 auto CheckForSameVectorNodes = [&](
const TreeEntry &E) {
14658 if (E.hasState()) {
14660 !TEs.
empty() &&
any_of(TEs, [&](
const TreeEntry *TE) {
14661 return AreReusedScalars(TE, E.Scalars, [&](
Value *V) {
14662 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
14663 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
14664 return is_contained(TEs, TE);
14671 !TEs.
empty() &&
any_of(TEs, [&](
const TreeEntry *TE) {
14672 return AreReusedScalars(TE, E.Scalars, [&](
Value *V) {
14673 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
14674 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
14675 return is_contained(TEs, TE);
14683 if (It != E.Scalars.end()) {
14685 !TEs.empty() &&
any_of(TEs, [&](
const TreeEntry *TE) {
14686 return AreReusedScalars(TE, E.Scalars, [&](
Value *V) {
14687 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
14688 return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
14689 return is_contained(TEs, TE);
14699 for (
unsigned Idx :
seq<unsigned>(BaseGraphSize)) {
14700 TreeEntry &
E = *VectorizableTree[Idx];
14701 if (
E.isGather()) {
14704 unsigned MinVF =
getMinVF(2 * Sz);
14707 if (VL.
size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
14708 !(!
E.hasState() ||
E.getOpcode() == Instruction::Load ||
14714 if (ForceLoadGather &&
E.hasState() &&
E.getOpcode() == Instruction::Load)
14717 if (CheckForSameVectorNodes(
E))
14721 unsigned StartIdx = 0;
14722 unsigned End = VL.
size();
14723 SmallBitVector Processed(End);
14725 *TTI, VL.
front()->getType(), VL.
size() - 1);
14727 *TTI, VL.
front()->getType(), VF - 1)) {
14728 if (StartIdx + VF > End)
14731 bool AllStrided =
true;
14737 for (
unsigned Cnt = StartIdx; Cnt < End; Cnt += VF) {
14738 const unsigned SliceVF = std::min(VF, End - Cnt);
14745 !getSameValuesTreeEntry(Slice.
front(), Slice,
true))
14752 bool IsSplat =
isSplat(Slice);
14753 bool IsTwoRegisterSplat =
true;
14754 if (IsSplat && VF == 2) {
14758 IsTwoRegisterSplat = NumRegs2VF == 2;
14760 if (Slices.
empty() || !IsSplat || !IsTwoRegisterSplat ||
14768 (S.getOpcode() == Instruction::Load &&
14773 std::optional<bool> MainOpIsCheap;
14774 auto IsMainOpCheap = [&] {
14775 if (!MainOpIsCheap)
14777 TTI->getInstructionCost(S.getMainOp(),
CostKind) <
14779 return *MainOpIsCheap;
14783 if ((!UserIgnoreList ||
E.Idx != 0) && IsMainOpCheap() &&
14791 if (S.getOpcode() == Instruction::Load) {
14796 PointerOps, SPtrInfo);
14807 if (UserIgnoreList &&
E.Idx == 0)
14812 }
else if (S.getOpcode() == Instruction::ExtractElement ||
14813 (IsMainOpCheap() &&
14814 !CheckOperandsProfitability(
14831 if (VF == 2 && AllStrided && Slices.
size() > 2)
14833 auto AddCombinedNode = [&](
unsigned Idx,
unsigned Cnt,
unsigned Sz) {
14834 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
14835 Processed.set(Cnt, Cnt + Sz);
14836 if (StartIdx == Cnt)
14837 StartIdx = Cnt + Sz;
14838 if (End == Cnt + Sz)
14841 for (
auto [Cnt, Sz] : Slices) {
14843 const TreeEntry *SameTE =
nullptr;
14845 It != Slice.
end()) {
14847 SameTE = getSameValuesTreeEntry(*It, Slice);
14849 unsigned PrevSize = VectorizableTree.size();
14850 [[maybe_unused]]
unsigned PrevEntriesSize =
14851 LoadEntriesToVectorize.size();
14852 buildTreeRec(Slice, 0,
EdgeInfo(&
E, UINT_MAX));
14853 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
14854 VectorizableTree[PrevSize]->isGather() &&
14855 VectorizableTree[PrevSize]->hasState() &&
14856 VectorizableTree[PrevSize]->getOpcode() !=
14857 Instruction::ExtractElement &&
14859 if (UserIgnoreList &&
E.Idx == 0 && VF == 2)
14861 VectorizableTree.pop_back();
14862 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
14863 "LoadEntriesToVectorize expected to remain the same");
14866 AddCombinedNode(PrevSize, Cnt, Sz);
14870 if (
E.CombinedEntriesWithIndices.empty() && !
E.ReorderIndices.empty()) {
14871 SmallVector<int>
Mask(
E.ReorderIndices.begin(),
E.ReorderIndices.end());
14873 E.ReorderIndices.clear();
14878 switch (
E.getOpcode()) {
14879 case Instruction::Load: {
14882 if (
E.State != TreeEntry::Vectorize)
14884 Type *ScalarTy =
E.getMainOp()->getType();
14890 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
14891 SmallVector<int>
Mask;
14895 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
14896 BaseLI->getPointerAddressSpace(),
CostKind,
14900 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
14901 VecTy, BaseLI->getPointerOperand(),
14902 false, CommonAlignment,
14909 ->getPointerOperand()
14912 SPtrInfo.
StrideVal = ConstantInt::get(StrideTy, 1);
14913 SPtrInfo.Ty = VecTy;
14914 TreeEntryToStridedPtrInfoMap[&
E] = SPtrInfo;
14915 E.State = TreeEntry::StridedVectorize;
14920 case Instruction::Store: {
14928 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
14929 SmallVector<int>
Mask;
14933 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
14934 BaseSI->getPointerAddressSpace(),
CostKind,
14938 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
14939 VecTy, BaseSI->getPointerOperand(),
14940 false, CommonAlignment,
14943 if (StridedCost < OriginalVecCost) {
14946 E.State = TreeEntry::StridedVectorize;
14948 ->getPointerOperand()
14952 SPtrInfo.Ty = VecTy;
14953 TreeEntryToStridedPtrInfoMap[&
E] = SPtrInfo;
14955 }
else if (!
E.ReorderIndices.empty()) {
14957 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int>
Mask) {
14959 assert(
Mask.size() > 1 &&
"Expected mask greater than 1 element.");
14960 if (
Mask.size() < 4)
14964 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
14965 TTI.isLegalInterleavedAccessType(
14966 VecTy, Factor, BaseSI->getAlign(),
14967 BaseSI->getPointerAddressSpace()))
14973 SmallVector<int>
Mask(
E.ReorderIndices.begin(),
E.ReorderIndices.end());
14974 unsigned InterleaveFactor = IsInterleaveMask(Mask);
14975 if (InterleaveFactor != 0)
14976 E.setInterleave(InterleaveFactor);
14980 case Instruction::Select: {
14981 if (
E.State != TreeEntry::Vectorize)
14986 E.CombinedOp = TreeEntry::MinMax;
14987 TreeEntry *CondEntry = getOperandEntry(&
E, 0);
14988 if (SelectOnly && CondEntry->UserTreeIndex &&
14989 CondEntry->State == TreeEntry::Vectorize) {
14991 CondEntry->State = TreeEntry::CombinedVectorize;
14996 SmallVector<unsigned> InversedCmpsIndices;
14997 if (matchesInversedZExtSelect(
E, InversedCmpsIndices)) {
14998 auto *CmpTE = getOperandEntry(&
E, 0);
14999 auto *Op1TE = getOperandEntry(&
E, 1);
15000 auto *Op2TE = getOperandEntry(&
E, 2);
15002 CmpTE->setOperations(
15003 InstructionsState(CmpTE->getMainOp(), CmpTE->getMainOp()));
15006 auto UpdateGatherEntry = [&](TreeEntry *OldTE, TreeEntry *NewTE,
15010 auto It = ValueToGatherNodes.find(V);
15011 assert(It != ValueToGatherNodes.end() &&
15012 "Expected to find the value in the map.");
15013 auto &
C = It->getSecond();
15020 for (
const unsigned Idx : InversedCmpsIndices) {
15021 Value *V1 = Op1TE->Scalars[Idx];
15022 Value *V2 = Op2TE->Scalars[Idx];
15023 std::swap(Op1TE->Scalars[Idx], Op2TE->Scalars[Idx]);
15025 UpdateGatherEntry(Op1TE, Op2TE, V1);
15026 UpdateGatherEntry(Op2TE, Op1TE, V2);
15028 OperandsToTreeEntry.emplace_or_assign(std::make_pair(&
E, 1), Op1TE);
15029 OperandsToTreeEntry.emplace_or_assign(std::make_pair(&
E, 2), Op2TE);
15032 if (matchesSelectOfBits(
E)) {
15034 const TreeEntry::CombinedOpcode
Code = TreeEntry::ReducedCmpBitcast;
15035 E.CombinedOp =
Code;
15036 auto *Op1TE = getOperandEntry(&
E, 1);
15037 auto *Op2TE = getOperandEntry(&
E, 2);
15038 Op1TE->State = TreeEntry::CombinedVectorize;
15039 Op1TE->CombinedOp =
Code;
15040 Op2TE->State = TreeEntry::CombinedVectorize;
15041 Op2TE->CombinedOp =
Code;
15046 case Instruction::FSub:
15047 case Instruction::FAdd: {
15049 if (
E.State != TreeEntry::Vectorize ||
15050 !
E.getOperations().isAddSubLikeOp() ||
15051 E.getOperations().isAltShuffle())
15053 const TreeEntry *
LHS = getOperandEntry(&
E, 0);
15054 const TreeEntry *
RHS = getOperandEntry(&
E, 1);
15055 auto IsOneUseVectorFMulOperand = [](
const TreeEntry *
TE) {
15056 return TE->State == TreeEntry::Vectorize &&
15057 TE->ReorderIndices.empty() &&
TE->ReuseShuffleIndices.empty() &&
15058 TE->getOpcode() == Instruction::FMul && !
TE->isAltShuffle() &&
15060 return (TE->hasCopyableElements() &&
15061 TE->isCopyableElement(V)) ||
15065 if (!IsOneUseVectorFMulOperand(
LHS) &&
15066 (
E.getOpcode() == Instruction::FSub ||
15067 !IsOneUseVectorFMulOperand(
RHS)))
15073 E.CombinedOp = TreeEntry::FMulAdd;
15074 TreeEntry *FMulEntry = getOperandEntry(&
E, 0);
15075 if (FMulEntry->UserTreeIndex &&
15076 FMulEntry->State == TreeEntry::Vectorize) {
15078 FMulEntry->State = TreeEntry::CombinedVectorize;
// Shl case of the combined-opcode switch: when matchesShlZExt() accepts node
// E, mark E and its operand nodes as one combined operation.
// NOTE(review): this listing omits original lines (the fused leading numbers
// are non-contiguous), including the statements after each guard and the
// declarations of Order, IsBSwap and ForLoads.
15082 case Instruction::Shl: {
// Guards: only node index 0 on a little-endian data layout is considered, and
// a UserIgnoreList must be present.
15083 if (
E.Idx != 0 || DL->isBigEndian())
15085 if (!UserIgnoreList)
15095 if (!matchesShlZExt(
E, Order, IsBSwap, ForLoads))
// Pick one of the four combined opcodes from the two flags reported by the
// matcher.
15098 TreeEntry::CombinedOpcode
Code =
15099 IsBSwap ? (ForLoads ? TreeEntry::ReducedBitcastBSwapLoads
15100 : TreeEntry::ReducedBitcastBSwap)
15101 : (ForLoads ? TreeEntry::ReducedBitcastLoads
15102 : TreeEntry::ReducedBitcast);
15103 E.CombinedOp =
Code;
// The order computed by the matcher becomes the node's reorder indices.
15104 E.ReorderIndices = std::move(Order);
// Operand 0 (the ZExt node) is folded into the combined operation.
15105 TreeEntry *ZExtEntry = getOperandEntry(&
E, 0);
15106 assert(ZExtEntry->UserTreeIndex &&
15107 ZExtEntry->State == TreeEntry::Vectorize &&
15108 ZExtEntry->getOpcode() == Instruction::ZExt &&
15109 "Expected ZExt node.");
15111 ZExtEntry->State = TreeEntry::CombinedVectorize;
15112 ZExtEntry->CombinedOp =
Code;
// The Load node feeding the ZExt is folded in as well. NOTE(review): the
// condition guarding this part is on a missing line, presumably ForLoads.
15114 TreeEntry *LoadsEntry = getOperandEntry(ZExtEntry, 0);
15115 assert(LoadsEntry->UserTreeIndex &&
15116 LoadsEntry->State == TreeEntry::Vectorize &&
15117 LoadsEntry->getOpcode() == Instruction::Load &&
15118 "Expected Load node.");
15120 LoadsEntry->State = TreeEntry::CombinedVectorize;
15121 LoadsEntry->CombinedOp =
Code;
// Operand 1 (the gathered shift amounts) is folded in too. The assert checks
// isGather(), so its message names a gather node rather than repeating the
// ZExt message used above.
15123 TreeEntry *ConstEntry = getOperandEntry(&
E, 1);
15124 assert(ConstEntry->UserTreeIndex && ConstEntry->isGather() &&
15125 "Expected gather node.");
15127 ConstEntry->State = TreeEntry::CombinedVectorize;
15128 ConstEntry->CombinedOp =
Code;
15136 if (LoadEntriesToVectorize.empty()) {
15138 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
15139 VectorizableTree.front()->getOpcode() == Instruction::Load)
15142 constexpr unsigned SmallTree = 3;
15143 constexpr unsigned SmallVF = 2;
15144 if ((VectorizableTree.size() <= SmallTree &&
15145 VectorizableTree.front()->Scalars.size() == SmallVF) ||
15146 (VectorizableTree.size() <= 2 && UserIgnoreList))
15149 if (VectorizableTree.front()->isNonPowOf2Vec() &&
15153 [](
const std::unique_ptr<TreeEntry> &TE) {
15154 return TE->isGather() &&
TE->hasState() &&
15155 TE->getOpcode() == Instruction::Load &&
15163 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
15167 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15168 TreeEntry &
E = *
TE;
15169 if (
E.isGather() &&
15170 ((
E.hasState() &&
E.getOpcode() == Instruction::Load) ||
15171 (!
E.hasState() &&
any_of(
E.Scalars,
15173 return isa<LoadInst>(V) &&
15174 !isVectorized(V) &&
15175 !isDeleted(cast<Instruction>(V));
15178 for (
Value *V :
E.Scalars) {
15185 *
this, V, *DL, *SE, *TTI,
15186 GatheredLoads[std::make_tuple(
15194 if (!GatheredLoads.
empty())
15195 tryToVectorizeGatheredLoads(GatheredLoads);
15205 bool IsFinalized =
false;
15231 bool SameNodesEstimated =
true;
15234 if (Ty->getScalarType()->isPointerTy()) {
15238 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
15239 Ty->getScalarType());
15257 assert(It != VL.
end() &&
"Expected at least one non-undef value.");
15260 count(VL, *It) > 1 &&
15262 if (!NeedShuffle) {
15265 return TTI.getShuffleCost(
15270 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
15271 CostKind, std::distance(VL.
begin(), It),
15277 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
15280 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
15284 VecTy, ShuffleMask, CostKind,
15288 return GatherCost +
15291 : R.getGatherCost(Gathers, !Root && VL.
equals(Gathers),
15299 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
15300 unsigned NumParts) {
15301 assert(VL.
size() > NumParts &&
"Unexpected scalarized shuffle.");
15309 return std::max(Sz, VecTy->getNumElements());
15316 -> std::optional<TTI::ShuffleKind> {
15317 if (NumElts <= EltsPerVector)
15318 return std::nullopt;
15323 return std::min(S,
I);
15326 int OffsetReg1 = OffsetReg0;
15330 int FirstRegId = -1;
15331 Indices.assign(1, OffsetReg0);
15335 int Idx =
I - OffsetReg0;
15337 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
15338 if (FirstRegId < 0)
15339 FirstRegId = RegId;
15340 RegIndices.
insert(RegId);
15341 if (RegIndices.
size() > 2)
15342 return std::nullopt;
15343 if (RegIndices.
size() == 2) {
15345 if (Indices.
size() == 1) {
15348 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
15349 [&](
int S,
int I) {
15350 if (I == PoisonMaskElem)
15352 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
15353 ((I - OffsetReg0) % NumElts) / EltsPerVector;
15354 if (RegId == FirstRegId)
15356 return std::min(S, I);
15359 unsigned Index = OffsetReg1 % NumElts;
15360 Indices.push_back(Index);
15361 SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
15363 Idx =
I - OffsetReg1;
15365 I = (Idx % NumElts) % EltsPerVector +
15366 (RegId == FirstRegId ? 0 : EltsPerVector);
15368 return ShuffleKind;
15376 if (!ShuffleKinds[Part])
15379 Part * EltsPerVector,
getNumElems(Mask.size(), EltsPerVector, Part));
15384 std::optional<TTI::ShuffleKind> RegShuffleKind =
15385 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
15386 if (!RegShuffleKind) {
15389 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.
size())))
15402 *R.TTI, VL.
front()->getType(),
alignTo(NumElts, EltsPerVector));
15403 for (
const auto [Idx, SubVecSize] :
zip(Indices, SubVecSizes)) {
15404 assert((Idx + SubVecSize) <= BaseVF &&
15405 "SK_ExtractSubvector index out of range");
15415 TTI, *ShuffleKinds[Part],
getWidenedType(ScalarTy, NumElts), SubMask);
15416 if (OriginalCost < Cost)
15417 Cost = OriginalCost;
/// Adds the shuffle cost of permuting node \p E1 (and \p E2 when non-null)
/// into the running Cost, updating CommonMask and SameNodesEstimated along
/// the way.
/// NOTE(review): part of the parameter list (the declarations of Mask and
/// Part, both used below) and several statements are on lines missing from
/// this listing (the fused leading numbers are non-contiguous). Comments
/// describe only what is visible.
15424 void estimateNodesPermuteCost(
const TreeEntry &E1,
const TreeEntry *E2,
15426 unsigned SliceSize) {
// Path taken while SameNodesEstimated is still set.
15427 if (SameNodesEstimated) {
15433 if ((InVectors.size() == 2 &&
15437 unsigned Limit =
getNumElems(Mask.size(), SliceSize, Part);
15440 "Expected all poisoned elements.");
// SubMask (built on a missing line) is copied into CommonMask at the start of
// slice number Part.
15442 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
// The pending inputs are costed as one shuffle: front() with back(), or with
// nullptr when only one input is queued. CommonMask is then rewritten by
// transformMaskAfterShuffle().
15447 Cost += createShuffle(InVectors.front(),
15448 InVectors.size() == 1 ?
nullptr : InVectors.back(),
15450 transformMaskAfterShuffle(CommonMask, CommonMask);
15451 }
else if (InVectors.size() == 2) {
// Two inputs are already queued: cost their shuffle first.
15452 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
15453 transformMaskAfterShuffle(CommonMask, CommonMask);
15455 SameNodesEstimated =
false;
// Single new node joining a single queued input: VF is widened to the larger
// vector factor and mask entries taken from Mask are offset by VF. The
// per-index condition guarding the assignment is on a missing line.
15456 if (!E2 && InVectors.size() == 1) {
15457 unsigned VF = E1.getVectorFactor();
15459 VF = std::max(VF, getVF(V1));
15462 VF = std::max(VF, E->getVectorFactor());
15464 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
15466 CommonMask[Idx] = Mask[Idx] + VF;
15467 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
15468 transformMaskAfterShuffle(CommonMask, CommonMask);
// Otherwise E1 and E2 are costed as a shuffle by Mask first, and that result
// is combined with the previously queued input P. The statements between
// these steps, including any update of InVectors, are on missing lines.
15470 auto P = InVectors.front();
15471 Cost += createShuffle(&E1, E2, Mask);
15472 unsigned VF = Mask.size();
15478 VF = std::max(VF, E->getVectorFactor());
15480 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
15482 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
15483 Cost += createShuffle(
P, InVectors.front(), CommonMask);
15484 transformMaskAfterShuffle(CommonMask, CommonMask);
15488 class ShuffleCostBuilder {
15491 static bool isEmptyOrIdentity(
ArrayRef<int> Mask,
unsigned VF) {
15493 return Mask.empty() ||
15494 (VF == Mask.size() &&
15502 ~ShuffleCostBuilder() =
default;
15508 if (isEmptyOrIdentity(Mask, VF))
15518 if (isEmptyOrIdentity(Mask, VF))
15520 return ::getShuffleCost(
15528 void resizeToMatch(
Value *&,
Value *&)
const {}
15538 ShuffleCostBuilder Builder(TTI);
15541 unsigned CommonVF = Mask.size();
15543 auto GetNodeMinBWAffectedCost = [&](
const TreeEntry &E,
15547 Type *EScalarTy = E.Scalars.front()->getType();
15548 bool IsSigned =
true;
15549 if (
auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
15551 IsSigned = It->second.second;
15553 if (EScalarTy != ScalarTy) {
15554 unsigned CastOpcode = Instruction::Trunc;
15555 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
15556 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
15558 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
15559 return TTI.getCastInstrCost(CastOpcode,
getWidenedType(ScalarTy, VF),
15569 Type *EScalarTy = VecTy->getElementType();
15570 if (EScalarTy != ScalarTy) {
15572 unsigned CastOpcode = Instruction::Trunc;
15573 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
15574 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
15576 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
15577 return TTI.getCastInstrCost(
15583 if (!V1 && !V2 && !P2.isNull()) {
15586 unsigned VF = E->getVectorFactor();
15588 CommonVF = std::max(VF, E2->getVectorFactor());
15591 return Idx < 2 * static_cast<int>(CommonVF);
15593 "All elements in mask must be less than 2 * CommonVF.");
15594 if (E->Scalars.size() == E2->Scalars.size()) {
15598 for (
int &Idx : CommonMask) {
15601 if (Idx <
static_cast<int>(CommonVF) && !EMask.
empty())
15603 else if (Idx >=
static_cast<int>(CommonVF))
15604 Idx = (E2Mask.
empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
15608 CommonVF = E->Scalars.size();
15609 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
15610 GetNodeMinBWAffectedCost(*E2, CommonVF);
15612 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
15613 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
15616 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
15617 }
else if (!V1 && P2.isNull()) {
15620 unsigned VF = E->getVectorFactor();
15624 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
15625 "All elements in mask must be less than CommonVF.");
15626 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
15628 assert(!EMask.
empty() &&
"Expected non-empty common mask.");
15629 for (
int &Idx : CommonMask) {
15633 CommonVF = E->Scalars.size();
15634 }
else if (
unsigned Factor = E->getInterleaveFactor();
15635 Factor > 0 && E->Scalars.size() != Mask.size() &&
15639 std::iota(CommonMask.begin(), CommonMask.end(), 0);
15641 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
15644 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
15645 CommonVF == CommonMask.size() &&
15647 [](
const auto &&
P) {
15649 static_cast<unsigned>(
P.value()) !=
P.index();
15657 }
else if (V1 && P2.isNull()) {
15659 ExtraCost += GetValueMinBWAffectedCost(V1);
15660 CommonVF = getVF(V1);
15663 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
15664 "All elements in mask must be less than CommonVF.");
15665 }
else if (V1 && !V2) {
15667 unsigned VF = getVF(V1);
15669 CommonVF = std::max(VF, E2->getVectorFactor());
15672 return Idx < 2 * static_cast<int>(CommonVF);
15674 "All elements in mask must be less than 2 * CommonVF.");
15675 if (E2->Scalars.size() == VF && VF != CommonVF) {
15677 assert(!E2Mask.
empty() &&
"Expected non-empty common mask.");
15678 for (
int &Idx : CommonMask) {
15681 if (Idx >=
static_cast<int>(CommonVF))
15682 Idx = E2Mask[Idx - CommonVF] + VF;
15686 ExtraCost += GetValueMinBWAffectedCost(V1);
15688 ExtraCost += GetNodeMinBWAffectedCost(
15689 *E2, std::min(CommonVF, E2->getVectorFactor()));
15690 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
15691 }
else if (!V1 && V2) {
15693 unsigned VF = getVF(V2);
15695 CommonVF = std::max(VF, E1->getVectorFactor());
15698 return Idx < 2 * static_cast<int>(CommonVF);
15700 "All elements in mask must be less than 2 * CommonVF.");
15701 if (E1->Scalars.size() == VF && VF != CommonVF) {
15703 assert(!E1Mask.
empty() &&
"Expected non-empty common mask.");
15704 for (
int &Idx : CommonMask) {
15707 if (Idx >=
static_cast<int>(CommonVF))
15708 Idx = E1Mask[Idx - CommonVF] + VF;
15714 ExtraCost += GetNodeMinBWAffectedCost(
15715 *E1, std::min(CommonVF, E1->getVectorFactor()));
15717 ExtraCost += GetValueMinBWAffectedCost(V2);
15718 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
15720 assert(V1 && V2 &&
"Expected both vectors.");
15721 unsigned VF = getVF(V1);
15722 CommonVF = std::max(VF, getVF(V2));
15725 return Idx < 2 * static_cast<int>(CommonVF);
15727 "All elements in mask must be less than 2 * CommonVF.");
15729 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
15732 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
15737 V2 = getAllOnesValue(*R.DL,
getWidenedType(ScalarTy, CommonVF));
15740 InVectors.front() =
15742 if (InVectors.size() == 2)
15743 InVectors.pop_back();
15744 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
15745 V1, V2, CommonMask, Builder, ScalarTy, VL);
15752 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
15753 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
15754 CheckedExtracts(CheckedExtracts) {}
15756 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
15757 unsigned NumParts,
bool &UseVecBaseAsInput) {
15758 UseVecBaseAsInput =
false;
15761 Value *VecBase =
nullptr;
15763 if (!E->ReorderIndices.empty()) {
15765 E->ReorderIndices.end());
15770 bool PrevNodeFound =
any_of(
15771 ArrayRef(R.VectorizableTree).take_front(E->Idx),
15772 [&](
const std::unique_ptr<TreeEntry> &TE) {
15773 return ((TE->hasState() && !TE->isAltShuffle() &&
15774 TE->getOpcode() == Instruction::ExtractElement) ||
15776 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
15777 return VL.size() > Data.index() &&
15778 (Mask[Data.index()] == PoisonMaskElem ||
15779 isa<UndefValue>(VL[Data.index()]) ||
15780 Data.value() == VL[Data.index()]);
15788 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
15802 VecBase = EE->getVectorOperand();
15803 UniqueBases.
insert(VecBase);
15805 if (!CheckedExtracts.
insert(V).second ||
15808 [&](
const TreeEntry *TE) {
15809 return R.DeletedNodes.contains(TE) ||
15810 R.TransformedToGatherNodes.contains(TE);
15812 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
15813 !R.isVectorized(EE) &&
15815 count_if(E->UserTreeIndex.UserTE->Scalars,
15816 [&](
Value *V) { return V == EE; })) ||
15819 return isa<GetElementPtrInst>(U) &&
15820 !R.areAllUsersVectorized(cast<Instruction>(U),
15828 unsigned Idx = *EEIdx;
15830 if (EE->hasOneUse() || !PrevNodeFound) {
15836 Cost -=
TTI.getExtractWithExtendCost(
15840 Cost +=
TTI.getCastInstrCost(
15846 APInt &DemandedElts =
15847 VectorOpsToExtracts
15850 .first->getSecond();
15851 DemandedElts.
setBit(Idx);
15854 for (
const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
15856 DemandedElts,
false,
15864 if (!PrevNodeFound)
15865 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
15868 transformMaskAfterShuffle(CommonMask, CommonMask);
15869 SameNodesEstimated =
false;
15870 if (NumParts != 1 && UniqueBases.
size() != 1) {
15871 UseVecBaseAsInput =
true;
15879 std::optional<InstructionCost>
15883 return std::nullopt;
15887 IsFinalized =
false;
15888 CommonMask.clear();
15891 VectorizedVals.clear();
15892 SameNodesEstimated =
true;
15899 return Idx < static_cast<int>(E1.getVectorFactor());
15901 "Expected single vector shuffle mask.");
15905 if (InVectors.empty()) {
15906 CommonMask.assign(Mask.begin(), Mask.end());
15907 InVectors.assign({&E1, &E2});
15910 assert(!CommonMask.empty() &&
"Expected non-empty common mask.");
15912 unsigned NumParts =
15916 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
15917 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
15921 if (InVectors.empty()) {
15922 CommonMask.assign(Mask.begin(), Mask.end());
15923 InVectors.assign(1, &E1);
15926 assert(!CommonMask.empty() &&
"Expected non-empty common mask.");
15928 unsigned NumParts =
15932 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
15933 estimateNodesPermuteCost(E1,
nullptr, Mask, Part, SliceSize);
15934 if (!SameNodesEstimated && InVectors.size() == 1)
15935 InVectors.emplace_back(&E1);
15941 assert(InVectors.size() == 1 &&
15948 ->getOrdered(
P.index()));
15949 return EI->getVectorOperand() == V1 ||
15950 EI->getVectorOperand() == V2;
15952 "Expected extractelement vectors.");
15958 if (InVectors.empty()) {
15959 assert(CommonMask.empty() && !ForExtracts &&
15960 "Expected empty input mask/vectors.");
15961 CommonMask.assign(Mask.begin(), Mask.end());
15962 InVectors.assign(1, V1);
15968 !CommonMask.empty() &&
15972 ->getOrdered(
P.index());
15974 return P.value() == Mask[
P.index()] ||
15979 return EI->getVectorOperand() == V1;
15981 "Expected only tree entry for extractelement vectors.");
15984 assert(!InVectors.empty() && !CommonMask.empty() &&
15985 "Expected only tree entries from extracts/reused buildvectors.");
15986 unsigned VF = getVF(V1);
15987 if (InVectors.size() == 2) {
15988 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
15989 transformMaskAfterShuffle(CommonMask, CommonMask);
15990 VF = std::max<unsigned>(VF, CommonMask.size());
15991 }
else if (
const auto *InTE =
15992 InVectors.front().dyn_cast<
const TreeEntry *>()) {
15993 VF = std::max(VF, InTE->getVectorFactor());
15997 ->getNumElements());
15999 InVectors.push_back(V1);
16000 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
16002 CommonMask[Idx] = Mask[Idx] + VF;
16005 Value *Root =
nullptr) {
16006 Cost += getBuildVectorCost(VL, Root);
16008 if (BVValues->empty() && InVectors.empty())
16009 BVValues->assign(VL.
begin(), VL.
end());
16016 unsigned VF = VL.
size();
16018 VF = std::min(VF, MaskVF);
16019 Type *VLScalarTy = VL.
front()->getType();
16043 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
16049 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
16054 IsFinalized =
true;
16057 if (InVectors.
size() == 2)
16058 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
16060 Cost += createShuffle(Vec,
nullptr, CommonMask);
16061 transformMaskAfterShuffle(CommonMask, CommonMask);
16063 "Expected vector length for the final value before action.");
16066 Cost += createShuffle(V1, V2, Mask);
16069 InVectors.
front() = V;
16071 if (!SubVectors.empty()) {
16073 if (InVectors.
size() == 2)
16074 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
16076 Cost += createShuffle(Vec,
nullptr, CommonMask);
16077 transformMaskAfterShuffle(CommonMask, CommonMask);
16079 if (!SubVectorsMask.
empty()) {
16081 "Expected same size of masks for subvectors and common mask.");
16083 copy(SubVectorsMask, SVMask.begin());
16084 for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
16087 I1 = I2 + CommonMask.
size();
16094 for (
auto [
E, Idx] : SubVectors) {
16095 Type *EScalarTy =
E->Scalars.front()->getType();
16096 bool IsSigned =
true;
16097 if (
auto It =
R.MinBWs.find(
E); It !=
R.MinBWs.end()) {
16100 IsSigned = It->second.second;
16102 if (ScalarTy != EScalarTy) {
16103 unsigned CastOpcode = Instruction::Trunc;
16104 unsigned DstSz =
R.DL->getTypeSizeInBits(ScalarTy);
16105 unsigned SrcSz =
R.DL->getTypeSizeInBits(EScalarTy);
16107 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
16108 Cost += TTI.getCastInstrCost(
16117 if (!CommonMask.
empty()) {
16118 std::iota(std::next(CommonMask.
begin(), Idx),
16119 std::next(CommonMask.
begin(), Idx +
E->getVectorFactor()),
16125 if (!ExtMask.
empty()) {
16126 if (CommonMask.
empty()) {
16130 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
16133 NewMask[
I] = CommonMask[ExtMask[
I]];
16135 CommonMask.
swap(NewMask);
16138 if (CommonMask.
empty()) {
16139 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
16146 createShuffle(InVectors.
front(),
16147 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
16152 assert((IsFinalized || CommonMask.empty()) &&
16153 "Shuffle construction must be finalized.");
16157const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(
const TreeEntry *E,
16158 unsigned Idx)
const {
16159 TreeEntry *
Op = OperandsToTreeEntry.
at({E, Idx});
16160 assert(
Op->isSame(
E->getOperand(Idx)) &&
"Operands mismatch!");
16165 if (
TE.State == TreeEntry::ScatterVectorize ||
16166 TE.State == TreeEntry::StridedVectorize)
16168 if (
TE.State == TreeEntry::CompressVectorize)
16170 if (
TE.State == TreeEntry::Vectorize &&
TE.getOpcode() == Instruction::Load &&
16171 !
TE.isAltShuffle()) {
16172 if (
TE.ReorderIndices.empty())
16174 SmallVector<int>
Mask;
16193 if (!L->getExitingBlock())
16200uint64_t BoUpSLP::getScaleToLoopIterations(
const TreeEntry &TE,
Value *Scalar,
16204 Parent =
U->getParent();
16205 }
else if (
TE.isGather() ||
TE.State == TreeEntry::SplitVectorize) {
16207 while (EI.UserTE) {
16208 if (EI.UserTE->isGather() ||
16209 EI.UserTE->State == TreeEntry::SplitVectorize) {
16210 EI = EI.UserTE->UserTreeIndex;
16213 if (EI.UserTE->State == TreeEntry::Vectorize &&
16214 EI.UserTE->getOpcode() == Instruction::PHI) {
16216 Parent = PH->getIncomingBlock(EI.EdgeIdx);
16218 Parent = EI.UserTE->getMainOp()->
getParent();
16225 Parent =
TE.getMainOp()->getParent();
16227 const Loop *
L = LI->getLoopFor(Parent);
16237uint64_t BoUpSLP::getLoopNestScale(
const Loop *L) {
16240 if (
auto It = LoopNestScaleCache.find(L); It != LoopNestScaleCache.end())
16245 SmallVector<const Loop *> Chain;
16246 for (
const Loop *Cur = L; Cur; Cur = Cur->getParentLoop()) {
16247 if (LoopNestScaleCache.contains(Cur))
16251 assert(!Chain.
empty() &&
"Early-return above should have handled cache hit.");
16252 uint64_t Scale = 1;
16253 if (
const Loop *Parent = Chain.
back()->getParentLoop())
16254 Scale = LoopNestScaleCache.lookup(Parent);
16258 for (
const Loop *Cur :
reverse(Chain)) {
16261 LoopNestScaleCache.try_emplace(Cur, std::max<uint64_t>(1, Scale));
16263 return std::max<uint64_t>(1, Scale);
16266uint64_t BoUpSLP::getGatherNodeEffectiveScale(
const TreeEntry &TE) {
16270 assert((
TE.isGather() ||
TE.State == TreeEntry::SplitVectorize) &&
16271 "Expected gather/split tree entry.");
16273 uint64_t BaseScale = getScaleToLoopIterations(TE);
16293 bool Overflow =
false;
16294 for (
Value *V :
TE.Scalars) {
16298 uint64_t LaneScale = std::min(getScaleToLoopIterations(TE, V), BaseScale);
16306 uint64_t Numerator =
SaturatingAdd(Sum, uint64_t(
N - 1), &Overflow);
16309 uint64_t Avg = Numerator /
N;
16310 return std::clamp<uint64_t>(Avg, 1, BaseScale);
16314BoUpSLP::getVectorSpillReloadCost(
const TreeEntry *
E,
Type *ScalarTy,
16315 VectorType *VecTy, VectorType *FinalVecTy,
16324 if (!
E->hasState() ||
E->getOpcode() == Instruction::Store ||
16325 E->getOpcode() == Instruction::ExtractElement ||
16326 E->getOpcode() == Instruction::ExtractValue ||
16327 E->getOpcode() == Instruction::Freeze ||
16328 (
E->getOpcode() == Instruction::Load &&
16329 E->State != TreeEntry::ScatterVectorize))
16330 return SpillsReloads;
16333 E->State == TreeEntry::Vectorize &&
E->getOpcode() == Instruction::PHI;
16334 SmallPtrSet<const TreeEntry *, 8> CountedOpEntries;
16335 SmallDenseMap<unsigned, unsigned> PressureByClass;
16336 auto AddPartsToClass = [&](
unsigned RegClass,
unsigned Parts) {
16337 assert(Parts != 0 &&
"Expected non-zero number of parts (registers).");
16338 PressureByClass[RegClass] += Parts;
16341 auto GetEntryVecTy =
16342 [&](
const TreeEntry *
TE) -> std::pair<Type *, VectorType *> {
16344 auto BWIt = MinBWs.find(TE);
16345 if (BWIt != MinBWs.end()) {
16351 return std::make_pair(ScalarTy,
16355 if (
E->State == TreeEntry::SplitVectorize) {
16356 for (
const auto &[Idx,
_] :
E->CombinedEntriesWithIndices) {
16357 const TreeEntry *OpTE = VectorizableTree[Idx].get();
16359 if (!CountedOpEntries.
insert(OpTE).second)
16361 auto [ScalarTy, OpVecTy] = GetEntryVecTy(OpTE);
16365 const unsigned RC =
16366 TTI->getRegisterClassForType(
true, OpVecTy);
16367 AddPartsToClass(RC, Parts);
16369 }
else if (IsPHI) {
16372 SmallDenseMap<unsigned, unsigned> MaxOpPressureByClass;
16374 const TreeEntry *OpTE = getOperandEntry(
E, Idx);
16375 auto [ScalarTy, OpVecTy] = GetEntryVecTy(OpTE);
16379 const unsigned RC =
16380 TTI->getRegisterClassForType(
true, OpVecTy);
16381 MaxOpPressureByClass[RC] = std::max(MaxOpPressureByClass[RC], Parts);
16383 for (
auto [RC, Parts] : MaxOpPressureByClass)
16384 AddPartsToClass(RC, Parts);
16389 if (
E->getOpcode() == Instruction::InsertElement && Idx == 0)
16397 const TreeEntry *OpTE = getOperandEntry(
E, Idx);
16399 if (!CountedOpEntries.
insert(OpTE).second)
16405 const unsigned RC =
16406 TTI->getRegisterClassForType(
true, OpVecTy);
16407 AddPartsToClass(RC, Parts);
16411 if (
E->getOpcode() != Instruction::Load) {
16413 if (ResParts != 0) {
16414 const unsigned RC = TTI->getRegisterClassForType(
true, VecTy);
16415 AddPartsToClass(RC, ResParts);
16417 if (VecTy != FinalVecTy) {
16418 const unsigned FinalResParts =
16420 if (FinalResParts != 0) {
16421 const unsigned RC =
16422 TTI->getRegisterClassForType(
true, FinalVecTy);
16423 AddPartsToClass(RC, FinalResParts);
16428 for (
auto [RegClass, UsedRegs] : PressureByClass) {
16429 const unsigned NumAvailRegs = TTI->getNumberOfRegisters(RegClass);
16430 if (NumAvailRegs == 0 || UsedRegs <= NumAvailRegs)
16432 const unsigned SpillCount = UsedRegs - NumAvailRegs;
16434 TTI->getRegisterClassReloadCost(RegClass,
CostKind);
16437 if (
E->Idx > 0 || !UserIgnoreList || !
E->Scalars[0]->getType()->isVoidTy())
16438 SingleRegSpillReload +=
16439 TTI->getRegisterClassSpillCost(RegClass,
CostKind);
16440 SpillsReloads += SingleRegSpillReload * SpillCount;
16442 return SpillsReloads;
16447 SmallPtrSetImpl<Value *> &CheckedExtracts) {
16451 if (
SLPReVec &&
E->State == TreeEntry::Vectorize &&
16452 E->getOpcode() == Instruction::InsertElement &&
16453 !
E->getOperand(1).back()->getType()->isVectorTy())
16456 return InstructionCost::getInvalid();
16461 auto It = MinBWs.find(
E);
16462 Type *OrigScalarTy = ScalarTy;
16463 if (It != MinBWs.end()) {
16469 const TreeEntry *ZExt = getOperandEntry(
E, 0);
16473 unsigned EntryVF =
E->getVectorFactor();
16477 getVectorSpillReloadCost(
E, ScalarTy, VecTy, FinalVecTy,
CostKind);
16478 if (
E->isGather() || TransformedToGatherNodes.contains(
E)) {
16482 return InstructionCost::getInvalid();
16483 return SpillsReloads +
16484 processBuildVector<ShuffleCostEstimator, InstructionCost>(
16485 E, ScalarTy, *TTI, VectorizedVals, *
this, CheckedExtracts);
16487 if (
E->State == TreeEntry::SplitVectorize) {
16488 assert(
E->CombinedEntriesWithIndices.size() == 2 &&
16489 "Expected exactly 2 combined entries.");
16490 assert(
E->ReuseShuffleIndices.empty() &&
"Expected empty reuses mask.");
16492 if (
E->ReorderIndices.empty()) {
16495 E->CombinedEntriesWithIndices.back().second,
16498 VectorizableTree[
E->CombinedEntriesWithIndices.back().first]
16499 ->getVectorFactor()));
16501 unsigned CommonVF =
16502 std::max(VectorizableTree[
E->CombinedEntriesWithIndices.front().first]
16503 ->getVectorFactor(),
16504 VectorizableTree[
E->CombinedEntriesWithIndices.back().first]
16505 ->getVectorFactor());
16510 VectorCost += SpillsReloads;
16511 LLVM_DEBUG(dumpTreeCosts(
E, 0, VectorCost, 0,
"Calculated costs for Tree"));
16515 SmallVector<int>
Mask;
16516 if (!
E->ReorderIndices.empty() &&
E->State != TreeEntry::CompressVectorize &&
16517 (
E->State != TreeEntry::StridedVectorize ||
16519 SmallVector<int> NewMask;
16520 if (
E->getOpcode() == Instruction::Store) {
16522 NewMask.
resize(
E->ReorderIndices.size());
16529 if (!
E->ReuseShuffleIndices.empty())
16534 assert((
E->State == TreeEntry::Vectorize ||
16535 E->State == TreeEntry::ScatterVectorize ||
16536 E->State == TreeEntry::StridedVectorize ||
16537 E->State == TreeEntry::CompressVectorize) &&
16538 "Unhandled state");
16541 (
E->getOpcode() == Instruction::GetElementPtr &&
16542 E->getMainOp()->getType()->isPointerTy()) ||
16543 E->hasCopyableElements()) &&
16546 unsigned ShuffleOrOp =
16547 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector :
E->getOpcode();
16548 if (
E->CombinedOp != TreeEntry::NotCombinedOp)
16549 ShuffleOrOp =
E->CombinedOp;
16550 SmallSetVector<Value *, 16> UniqueValues;
16551 SmallVector<unsigned, 16> UniqueIndexes;
16553 if (UniqueValues.insert(V))
16554 UniqueIndexes.push_back(Idx);
16555 const unsigned Sz = UniqueValues.size();
16556 SmallBitVector UsedScalars(Sz,
false);
16557 for (
unsigned I = 0;
I < Sz; ++
I) {
16559 !
E->isCopyableElement(UniqueValues[
I]) &&
16560 getTreeEntries(UniqueValues[
I]).
front() ==
E)
16562 UsedScalars.set(
I);
16564 auto GetCastContextHint = [&](
Value *
V) {
16566 return getCastContextHint(*OpTEs.front());
16567 InstructionsState SrcState =
getSameOpcode(
E->getOperand(0), *TLI);
16568 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
16569 !SrcState.isAltShuffle())
16582 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
16584 for (
unsigned I = 0;
I < Sz; ++
I) {
16585 if (UsedScalars.test(
I))
16587 ScalarCost += ScalarEltCost(
I);
16594 if (It != MinBWs.end() && !UnaryInstruction::isCast(
E->getOpcode()) &&
16596 (
E->getOpcode() != Instruction::Load ||
E->UserTreeIndex)) {
16598 if (!EI.UserTE->hasState() ||
16599 EI.UserTE->getOpcode() != Instruction::Select ||
16601 auto UserBWIt = MinBWs.find(EI.UserTE);
16602 Type *UserScalarTy =
16603 (EI.UserTE->isGather() ||
16604 EI.UserTE->State == TreeEntry::SplitVectorize)
16605 ? EI.UserTE->Scalars.front()->getType()
16606 : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
16607 if (UserBWIt != MinBWs.end())
16609 UserBWIt->second.first);
16610 if (ScalarTy != UserScalarTy) {
16611 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
16612 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
16613 unsigned VecOpcode;
16615 if (BWSz > SrcBWSz)
16616 VecOpcode = Instruction::Trunc;
16619 It->second.second ? Instruction::SExt : Instruction::ZExt;
16621 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
16626 VecCost += SpillsReloads;
16627 LLVM_DEBUG(dumpTreeCosts(
E, CommonCost, VecCost - CommonCost,
16628 ScalarCost,
"Calculated costs for Tree"));
16629 return VecCost - ScalarCost;
16634 assert((
E->State == TreeEntry::Vectorize ||
16635 E->State == TreeEntry::StridedVectorize ||
16636 E->State == TreeEntry::CompressVectorize) &&
16637 "Entry state expected to be Vectorize, StridedVectorize or "
16638 "MaskedLoadCompressVectorize here.");
16642 *TTI, Ptrs, BasePtr,
E->getOpcode(),
CostKind, OrigScalarTy, VecTy);
16643 LLVM_DEBUG(dumpTreeCosts(
E, 0, VecCost, ScalarCost,
16644 "Calculated GEPs cost for Tree"));
16646 return VecCost - ScalarCost + SpillsReloads;
16652 return InstructionCost::getInvalid();
16653 Type *CanonicalType = Ty;
16659 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
16660 {CanonicalType, CanonicalType});
16662 TTI->getIntrinsicInstrCost(CostAttrs,
CostKind);
16665 if (VI && SelectOnly) {
16667 "Expected only for scalar type.");
16670 CI->
getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
16671 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
16672 {TTI::OK_AnyValue, TTI::OP_None}, CI);
16676 auto GetFMulAddCost = [&, &TTI = *TTI](
const InstructionsState &S,
16681 switch (ShuffleOrOp) {
16682 case Instruction::PHI: {
16685 SmallPtrSet<const TreeEntry *, 4> CountedOps;
16686 for (
Value *V : UniqueValues) {
16691 ValueList Operands(
PHI->getNumIncomingValues(),
nullptr);
16692 for (
unsigned I = 0,
N =
PHI->getNumIncomingValues();
I <
N; ++
I) {
16696 if (
const TreeEntry *OpTE =
16697 getSameValuesTreeEntry(Operands.
front(), Operands))
16698 if (CountedOps.
insert(OpTE).second &&
16699 !OpTE->ReuseShuffleIndices.empty())
16700 ScalarCost +=
TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
16701 OpTE->Scalars.size());
16704 return CommonCost - ScalarCost + SpillsReloads;
16706 case Instruction::ExtractValue:
16707 case Instruction::ExtractElement: {
16708 APInt DemandedElts;
16710 auto GetScalarCost = [&](
unsigned Idx) {
16716 if (ShuffleOrOp == Instruction::ExtractElement) {
16718 SrcVecTy = EE->getVectorOperandType();
16721 Type *AggregateTy = EV->getAggregateOperand()->getType();
16724 NumElts = ATy->getNumElements();
16730 if (
I->hasOneUse()) {
16740 Cost -= TTI->getCastInstrCost(
16746 if (DemandedElts.
isZero())
16752 return CommonCost - (DemandedElts.
isZero()
16754 : TTI.getScalarizationOverhead(
16755 SrcVecTy, DemandedElts,
false,
16758 return GetCostDiff(GetScalarCost, GetVectorCost);
16760 case Instruction::InsertElement: {
16761 assert(
E->ReuseShuffleIndices.empty() &&
16762 "Unique insertelements only are expected.");
16764 unsigned const NumElts = SrcVecTy->getNumElements();
16765 unsigned const NumScalars = VL.
size();
16767 unsigned NumOfParts =
16772 unsigned OffsetEnd = OffsetBeg;
16773 InsertMask[OffsetBeg] = 0;
16776 if (OffsetBeg > Idx)
16778 else if (OffsetEnd < Idx)
16780 InsertMask[Idx] =
I + 1;
16783 if (NumOfParts > 0 && NumOfParts < NumElts)
16784 VecScalarsSz =
PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
16785 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
16787 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
16788 unsigned InsertVecSz = std::min<unsigned>(
16790 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
16791 bool IsWholeSubvector =
16792 OffsetBeg ==
Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
16796 if (OffsetBeg + InsertVecSz > VecSz) {
16799 InsertVecSz = VecSz;
16804 SmallVector<int>
Mask;
16805 if (!
E->ReorderIndices.empty()) {
16810 std::iota(
Mask.begin(), std::next(
Mask.begin(), InsertVecSz), 0);
16812 bool IsIdentity =
true;
16814 Mask.swap(PrevMask);
16815 for (
unsigned I = 0;
I < NumScalars; ++
I) {
16817 DemandedElts.
setBit(InsertIdx);
16818 IsIdentity &= InsertIdx - OffsetBeg ==
I;
16819 Mask[InsertIdx - OffsetBeg] =
I;
16821 assert(
Offset < NumElts &&
"Failed to find vector index offset");
16835 InsertVecTy, Mask);
16837 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
16843 SmallBitVector InMask =
16845 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
16846 if (!InMask.
all() && NumScalars != NumElts && !IsWholeSubvector) {
16847 if (InsertVecSz != VecSz) {
16852 for (
unsigned I = 0, End = OffsetBeg -
Offset;
I < End; ++
I)
16854 for (
unsigned I = OffsetBeg -
Offset, End = OffsetEnd -
Offset;
16858 for (
unsigned I = OffsetEnd + 1 -
Offset;
I < VecSz; ++
I)
16865 return Cost + SpillsReloads;
16867 case Instruction::ZExt:
16868 case Instruction::SExt:
16869 case Instruction::FPToUI:
16870 case Instruction::FPToSI:
16871 case Instruction::FPExt:
16872 case Instruction::PtrToInt:
16873 case Instruction::IntToPtr:
16874 case Instruction::SIToFP:
16875 case Instruction::UIToFP:
16876 case Instruction::Trunc:
16877 case Instruction::FPTrunc:
16878 case Instruction::BitCast: {
16879 auto SrcIt = MinBWs.find(getOperandEntry(
E, 0));
16882 unsigned Opcode = ShuffleOrOp;
16883 unsigned VecOpcode = Opcode;
16885 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
16887 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->
getScalarType());
16888 if (SrcIt != MinBWs.end()) {
16889 SrcBWSz = SrcIt->second.first;
16895 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->
getScalarType());
16896 if (BWSz == SrcBWSz) {
16897 VecOpcode = Instruction::BitCast;
16898 }
else if (BWSz < SrcBWSz) {
16899 VecOpcode = Instruction::Trunc;
16900 }
else if (It != MinBWs.end()) {
16901 assert(BWSz > SrcBWSz &&
"Invalid cast!");
16902 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
16903 }
else if (SrcIt != MinBWs.end()) {
16904 assert(BWSz > SrcBWSz &&
"Invalid cast!");
16906 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
16908 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
16909 !SrcIt->second.second) {
16910 VecOpcode = Instruction::UIToFP;
16913 assert(Idx == 0 &&
"Expected 0 index only");
16914 return TTI->getCastInstrCost(Opcode, VL0->
getType(),
16921 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
16923 auto *
VI = VL0->
getOpcode() == Opcode ? VL0 :
nullptr;
16926 bool IsArithmeticExtendedReduction =
16927 E->Idx == 0 && UserIgnoreList &&
16930 return is_contained({Instruction::Add, Instruction::FAdd,
16931 Instruction::Mul, Instruction::FMul,
16932 Instruction::And, Instruction::Or,
16936 if (IsArithmeticExtendedReduction &&
16937 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
16939 return CommonCost +
16940 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH,
CostKind,
16941 VecOpcode == Opcode ? VI :
nullptr);
16943 return GetCostDiff(GetScalarCost, GetVectorCost);
16945 case Instruction::FCmp:
16946 case Instruction::ICmp:
16953 case Instruction::Select: {
16954 CmpPredicate VecPred, SwappedVecPred;
16957 match(VL0, MatchCmp))
16963 auto GetScalarCost = [&](
unsigned Idx) {
16978 ShuffleOrOp == Instruction::Select &&
16993 "Expected same type for LHS/RHS");
16996 ScalarCost = TTI->getArithmeticInstrCost(
16998 getOperandInfo(
VI->getOperand(0)), getOperandInfo(
RHS));
17001 ScalarCost = TTI->getArithmeticInstrCost(
17003 getOperandInfo(
VI->getOperand(0)), getOperandInfo(
LHS));
17009 ScalarCost = TTI->getCmpSelInstrCost(
17010 E->getOpcode(), OrigScalarTy,
17015 VI->getOperand(ShuffleOrOp == Instruction::Select ? 1 : 0)),
17017 VI->getOperand(ShuffleOrOp == Instruction::Select ? 2 : 1)),
17031 auto *MaskTy =
getWidenedType(ShuffleOrOp == Instruction::Select
17037 if (ShuffleOrOp == Instruction::Select) {
17047 VecCost = TTI->getArithmeticInstrCost(
17048 Instruction::Or, VecTy,
CostKind, getOperandInfo(
Cond),
17049 getOperandInfo(
RHS));
17054 VecCost = TTI->getArithmeticInstrCost(
17055 Instruction::And, VecTy,
CostKind, getOperandInfo(
Cond),
17056 getOperandInfo(
LHS));
17061 VecCost = TTI->getCmpSelInstrCost(
17062 E->getOpcode(), VecTy, MaskTy, VecPred,
CostKind,
17064 E->getOperand(ShuffleOrOp == Instruction::Select ? 1 : 0)),
17066 E->getOperand(ShuffleOrOp == Instruction::Select ? 2 : 1)),
17071 assert(VecTyNumElements >= CondNumElements &&
17072 VecTyNumElements % CondNumElements == 0 &&
17073 "Cannot vectorize Instruction::Select");
17074 if (CondNumElements != VecTyNumElements) {
17084 return VecCost + CommonCost;
17086 return GetCostDiff(GetScalarCost, GetVectorCost);
17088 case TreeEntry::MinMax: {
17089 auto GetScalarCost = [&](
unsigned Idx) {
17090 return GetMinMaxCost(OrigScalarTy);
17094 return VecCost + CommonCost;
17096 return GetCostDiff(GetScalarCost, GetVectorCost);
17098 case TreeEntry::FMulAdd: {
17099 auto GetScalarCost = [&](
unsigned Idx) {
17102 return GetFMulAddCost(
E->getOperations(),
17108 for (
Value *V :
E->Scalars) {
17110 FMF &= FPCI->getFastMathFlags();
17112 FMF &= FPCIOp->getFastMathFlags();
17115 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
17116 {VecTy, VecTy, VecTy}, FMF);
17118 return VecCost + CommonCost;
17120 return GetCostDiff(GetScalarCost, GetVectorCost);
17122 case TreeEntry::ReducedBitcast:
17123 case TreeEntry::ReducedBitcastBSwap: {
17124 auto GetScalarCost = [&, &TTI = *TTI](
unsigned Idx) {
17134 ScalarCost += TTI.getInstructionCost(ZExt,
CostKind);
17138 const TreeEntry *LhsTE = getOperandEntry(
E, 0);
17140 getCastContextHint(*getOperandEntry(LhsTE, 0));
17142 auto *SrcVecTy =
getWidenedType(SrcScalarTy, LhsTE->getVectorFactor());
17144 Instruction::BitCast, ScalarTy, SrcVecTy, CastCtx,
CostKind);
17145 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
17146 auto *SrcType = IntegerType::getIntNTy(
17148 DL->getTypeSizeInBits(SrcScalarTy) * EntryVF);
17149 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
17151 TTI.getIntrinsicInstrCost(CostAttrs,
CostKind);
17153 if (SrcType != ScalarTy) {
17155 TTI.getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
17159 return BitcastCost + CommonCost;
17161 return GetCostDiff(GetScalarCost, GetVectorCost);
17163 case TreeEntry::ReducedBitcastLoads:
17164 case TreeEntry::ReducedBitcastBSwapLoads: {
17165 auto GetScalarCost = [&, &TTI = *TTI](
unsigned Idx) {
17175 ScalarCost += TTI.getInstructionCost(ZExt,
CostKind);
17179 ScalarCost += TTI.getInstructionCost(Load,
CostKind);
17183 const TreeEntry *LhsTE = getOperandEntry(
E, 0);
17184 const TreeEntry *LoadTE = getOperandEntry(LhsTE, 0);
17186 auto *SrcType = IntegerType::getIntNTy(
17188 DL->getTypeSizeInBits(LI0->getType()) * EntryVF);
17190 TTI.getMemoryOpCost(Instruction::Load, SrcType, LI0->getAlign(),
17191 LI0->getPointerAddressSpace(),
CostKind);
17192 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
17193 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
17195 TTI.getIntrinsicInstrCost(CostAttrs,
CostKind);
17197 if (SrcType != ScalarTy) {
17199 TTI.getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
17203 return LoadCost + CommonCost;
17205 return GetCostDiff(GetScalarCost, GetVectorCost);
17207 case TreeEntry::ReducedCmpBitcast: {
17208 auto GetScalarCost = [&, &TTI = *TTI](
unsigned Idx) {
17220 IntegerType::getIntNTy(ScalarTy->
getContext(),
E->getVectorFactor());
17222 TTI.getCastInstrCost(Instruction::BitCast, DstTy, CmpTy,
17224 if (DstTy != ScalarTy) {
17226 TTI.getCastInstrCost(Instruction::ZExt, ScalarTy, DstTy,
17229 return BitcastCost + CommonCost;
17231 return GetCostDiff(GetScalarCost, GetVectorCost);
17233 case Instruction::FNeg:
17234 case Instruction::Add:
17235 case Instruction::FAdd:
17236 case Instruction::Sub:
17237 case Instruction::FSub:
17238 case Instruction::Mul:
17239 case Instruction::FMul:
17240 case Instruction::UDiv:
17241 case Instruction::SDiv:
17242 case Instruction::FDiv:
17243 case Instruction::URem:
17244 case Instruction::SRem:
17245 case Instruction::FRem:
17246 case Instruction::Shl:
17247 case Instruction::LShr:
17248 case Instruction::AShr:
17249 case Instruction::And:
17250 case Instruction::Or:
17251 case Instruction::Xor: {
17252 auto GetScalarCost = [&](
unsigned Idx) {
17259 unsigned Lane = UniqueIndexes[Idx];
17260 Value *Op1 =
E->getOperand(0)[Lane];
17262 SmallVector<const Value *, 2> Operands(1, Op1);
17266 Op2 =
E->getOperand(1)[Lane];
17272 ShuffleOrOp, OrigScalarTy,
CostKind, Op1Info, Op2Info, Operands);
17274 I && (ShuffleOrOp == Instruction::FAdd ||
17275 ShuffleOrOp == Instruction::FSub)) {
17283 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
17288 return CI && CI->getValue().countr_one() >= It->second.first;
17296 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy,
CostKind, Op1Info,
17297 Op2Info, {},
nullptr, TLI) +
17300 return GetCostDiff(GetScalarCost, GetVectorCost);
17302 case Instruction::GetElementPtr: {
17303 return CommonCost + GetGEPCostDiff(VL, VL0);
17305 case Instruction::Load: {
17306 auto GetScalarCost = [&](
unsigned Idx) {
17308 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
17309 VI->getAlign(),
VI->getPointerAddressSpace(),
17315 switch (
E->State) {
17316 case TreeEntry::Vectorize:
17317 if (
unsigned Factor =
E->getInterleaveFactor()) {
17318 VecLdCost = TTI->getInterleavedMemoryOpCost(
17319 Instruction::Load, VecTy, Factor, {}, LI0->getAlign(),
17320 LI0->getPointerAddressSpace(),
CostKind);
17323 VecLdCost = TTI->getMemoryOpCost(
17324 Instruction::Load, VecTy, LI0->getAlign(),
17328 case TreeEntry::StridedVectorize: {
17330 FixedVectorType *StridedLoadTy = SPtrInfo.
Ty;
17331 assert(StridedLoadTy &&
"Missing StridedPointerInfo for tree entry.");
17332 Align CommonAlignment =
17334 VecLdCost = TTI->getMemIntrinsicInstrCost(
17335 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
17336 StridedLoadTy, LI0->getPointerOperand(),
17337 false, CommonAlignment),
17339 if (StridedLoadTy != VecTy)
17341 TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy,
17346 case TreeEntry::CompressVectorize: {
17348 unsigned InterleaveFactor;
17349 SmallVector<int> CompressMask;
17352 if (!
E->ReorderIndices.empty()) {
17353 SmallVector<int>
Mask(
E->ReorderIndices.begin(),
17354 E->ReorderIndices.end());
17361 Scalars, PointerOps,
E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
17362 *TLI, [](
Value *) { return true; }, IsMasked, InterleaveFactor,
17363 CompressMask, LoadVecTy);
17364 CompressEntryToData.try_emplace(
E, CompressMask, LoadVecTy,
17365 InterleaveFactor, IsMasked);
17366 Align CommonAlignment = LI0->getAlign();
17367 if (InterleaveFactor) {
17368 VecLdCost = TTI->getInterleavedMemoryOpCost(
17369 Instruction::Load, LoadVecTy, InterleaveFactor, {},
17370 CommonAlignment, LI0->getPointerAddressSpace(),
CostKind);
17371 }
else if (IsMasked) {
17372 VecLdCost = TTI->getMemIntrinsicInstrCost(
17373 MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
17375 LI0->getPointerAddressSpace()),
17379 LoadVecTy, CompressMask,
CostKind);
17381 VecLdCost = TTI->getMemoryOpCost(
17382 Instruction::Load, LoadVecTy, CommonAlignment,
17386 LoadVecTy, CompressMask,
CostKind);
17390 case TreeEntry::ScatterVectorize: {
17391 Align CommonAlignment =
17393 VecLdCost = TTI->getMemIntrinsicInstrCost(
17394 MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
17395 LI0->getPointerOperand(),
17396 false, CommonAlignment),
17400 case TreeEntry::CombinedVectorize:
17401 case TreeEntry::SplitVectorize:
17402 case TreeEntry::NeedToGather:
17405 return VecLdCost + CommonCost;
17411 if (
E->State == TreeEntry::ScatterVectorize)
17418 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
17420 case Instruction::Store: {
17421 bool IsReorder = !
E->ReorderIndices.empty();
17422 auto GetScalarCost = [=](
unsigned Idx) {
17425 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
17426 VI->getAlign(),
VI->getPointerAddressSpace(),
17434 if (
E->State == TreeEntry::StridedVectorize) {
17435 Align CommonAlignment =
17437 VecStCost = TTI->getMemIntrinsicInstrCost(
17438 MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
17439 VecTy, BaseSI->getPointerOperand(),
17440 false, CommonAlignment),
17443 assert(
E->State == TreeEntry::Vectorize &&
17444 "Expected either strided or consecutive stores.");
17445 if (
unsigned Factor =
E->getInterleaveFactor()) {
17446 assert(
E->ReuseShuffleIndices.empty() && !
E->ReorderIndices.empty() &&
17447 "No reused shuffles expected");
17449 VecStCost = TTI->getInterleavedMemoryOpCost(
17450 Instruction::Store, VecTy, Factor, {}, BaseSI->getAlign(),
17451 BaseSI->getPointerAddressSpace(),
CostKind);
17454 VecStCost = TTI->getMemoryOpCost(
17455 Instruction::Store, VecTy, BaseSI->getAlign(),
17456 BaseSI->getPointerAddressSpace(),
CostKind, OpInfo);
17459 return VecStCost + CommonCost;
17463 unsigned Idx = IsReorder ?
E->ReorderIndices[
I] :
I;
17467 return GetCostDiff(GetScalarCost, GetVectorCost) +
17468 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
17470 case Instruction::Call: {
17471 auto GetScalarCost = [&](
unsigned Idx) {
17475 IntrinsicCostAttributes CostAttrs(
ID, *CI, 1);
17476 return TTI->getIntrinsicInstrCost(CostAttrs,
CostKind);
17486 CI,
ID, VecTy->getNumElements(),
17487 It != MinBWs.end() ? It->second.first : 0, TTI);
17489 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
17491 return GetCostDiff(GetScalarCost, GetVectorCost);
17493 case Instruction::ShuffleVector: {
17501 "Invalid Shuffle Vector Operand");
17504 auto TryFindNodeWithEqualOperands = [=]() {
17505 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
17508 if (
TE->hasState() &&
TE->isAltShuffle() &&
17509 ((
TE->getOpcode() ==
E->getOpcode() &&
17510 TE->getAltOpcode() ==
E->getAltOpcode()) ||
17511 (
TE->getOpcode() ==
E->getAltOpcode() &&
17512 TE->getAltOpcode() ==
E->getOpcode())) &&
17513 TE->hasEqualOperands(*
E))
17518 auto GetScalarCost = [&](
unsigned Idx) {
17523 assert(
E->getMatchingMainOpOrAltOp(VI) &&
17524 "Unexpected main/alternate opcode");
17526 return TTI->getInstructionCost(VI,
CostKind);
17534 if (TryFindNodeWithEqualOperands()) {
17536 dbgs() <<
"SLP: diamond match for alternate node found.\n";
17543 TTIRef.getArithmeticInstrCost(
E->getOpcode(), VecTy,
CostKind);
17545 TTIRef.getArithmeticInstrCost(
E->getAltOpcode(), VecTy,
CostKind);
17548 VecCost = TTIRef.getCmpSelInstrCost(
17549 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(),
CostKind,
17550 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
17552 VecCost += TTIRef.getCmpSelInstrCost(
17553 E->getOpcode(), VecTy, MaskTy,
17555 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
17558 Type *SrcSclTy =
E->getMainOp()->getOperand(0)->getType();
17561 auto SrcIt = MinBWs.find(getOperandEntry(
E, 0));
17562 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
17564 DL->getTypeSizeInBits(
E->getMainOp()->getOperand(0)->getType());
17565 if (SrcIt != MinBWs.end()) {
17566 SrcBWSz = SrcIt->second.first;
17570 if (BWSz <= SrcBWSz) {
17571 if (BWSz < SrcBWSz)
17573 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
17577 <<
"SLP: alternate extension, which should be truncated.\n";
17583 VecCost = TTIRef.getCastInstrCost(
E->getOpcode(), VecTy, SrcTy,
17586 TTIRef.getCastInstrCost(
E->getAltOpcode(), VecTy, SrcTy,
17589 SmallVector<int>
Mask;
17590 E->buildAltOpShuffleMask(
17591 [&](Instruction *
I) {
17592 assert(
E->getMatchingMainOpOrAltOp(
I) &&
17593 "Unexpected main/alternate opcode");
17604 unsigned Opcode0 =
E->getOpcode();
17605 unsigned Opcode1 =
E->getAltOpcode();
17606 SmallBitVector OpcodeMask(
17610 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
17612 VecTy, Opcode0, Opcode1, OpcodeMask,
CostKind);
17613 return AltVecCost < VecCost ? AltVecCost : VecCost;
17619 return GetCostDiff(
17624 "Not supported shufflevector usage.");
17626 unsigned SVNumElements =
17628 ->getNumElements();
17629 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
17630 for (
size_t I = 0, End = VL.
size();
I != End;
I += GroupSize) {
17635 "Not supported shufflevector usage.");
17638 [[maybe_unused]]
bool IsExtractSubvectorMask =
17639 SV->isExtractSubvectorMask(Index);
17640 assert(IsExtractSubvectorMask &&
17641 "Not supported shufflevector usage.");
17642 if (NextIndex != Index)
17644 NextIndex += SV->getShuffleMask().size();
17647 return ::getShuffleCost(
17653 return GetCostDiff(GetScalarCost, GetVectorCost);
17655 case Instruction::Freeze:
17662bool BoUpSLP::isFullyVectorizableTinyTree(
bool ForReduction)
const {
17664 << VectorizableTree.size() <<
" is fully vectorizable .\n");
17666 auto &&AreVectorizableGathers = [
this](
const TreeEntry *
TE,
unsigned Limit) {
17667 SmallVector<int>
Mask;
17668 return TE->isGather() &&
17670 [
this](
Value *V) { return EphValues.contains(V); }) &&
17672 TE->Scalars.size() < Limit ||
17673 (((
TE->hasState() &&
17674 TE->getOpcode() == Instruction::ExtractElement) ||
17677 (
TE->hasState() &&
TE->getOpcode() == Instruction::Load &&
17678 !
TE->isAltShuffle()) ||
17683 if (VectorizableTree.size() == 1 &&
17684 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
17685 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
17686 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
17688 AreVectorizableGathers(VectorizableTree[0].
get(),
17689 VectorizableTree[0]->Scalars.size()) &&
17690 VectorizableTree[0]->getVectorFactor() > 2)))
17693 if (VectorizableTree.size() != 2)
17700 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
17701 AreVectorizableGathers(VectorizableTree[1].
get(),
17702 VectorizableTree[0]->Scalars.size()))
17706 if (VectorizableTree[0]->
isGather() ||
17707 (VectorizableTree[1]->
isGather() &&
17708 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
17709 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
17710 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
17721 if (VectorizableTree.empty()) {
17722 assert(ExternalUses.empty() &&
"We shouldn't have any external users");
17729 const unsigned TreeSize = VectorizableTree.size();
17730 const TreeEntry &Front = *VectorizableTree.front();
17731 const bool FrontIsGather = Front.isGather();
17732 const bool FrontHasState = Front.hasState();
17733 const unsigned FrontOpcode = FrontHasState ? Front.getOpcode() : 0u;
17737 constexpr unsigned Limit = 4;
17738 constexpr unsigned LargeTree = 20;
17739 constexpr unsigned LimitTreeSize = 36;
17744 if (!ForReduction) {
17747 if (TreeSize == 1 && FrontIsGather) {
17748 if (FrontHasState && FrontOpcode == Instruction::ExtractElement)
17754 all_of(VectorizableTree, [](
const std::unique_ptr<TreeEntry> &TE) {
17755 return TE->isGather() || TE->State == TreeEntry::SplitVectorize;
17759 FrontOpcode == Instruction::ExtractElement &&
17760 (Front.getVectorFactor() == 2 ||
17764 auto *I = dyn_cast<Instruction>(V);
17765 return !I || !areAllUsersVectorized(I, UserIgnoreList);
17771 VectorizableTree[1]->isGather() &&
17772 (VectorizableTree[1]->getVectorFactor() <= 2 ||
17773 !(
isSplat(VectorizableTree[1]->Scalars) ||
17780 (!ForReduction || Front.getVectorFactor() <= 2) &&
17782 [&](
const std::unique_ptr<TreeEntry> &TE) {
17783 return TE->isGather() && TE->getVectorFactor() <= Limit &&
17793 if (!ForReduction) {
17798 if (!ThresholdSet &&
17799 all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
17800 const bool IsGather = TE->isGather();
17801 const bool HasState = TE->hasState();
17802 const unsigned Op = HasState ? TE->getOpcode() : 0u;
17803 if (IsGather && (!HasState ||
Op != Instruction::ExtractElement) &&
17806 return HasState &&
Op == Instruction::PHI;
17812 if (ThresholdSet && TreeSize <= Limit) {
17813 bool HasVectorPhi =
false;
17814 auto Compatible = [&](
const std::unique_ptr<TreeEntry> &TE) {
17815 const bool IsGather = TE->isGather();
17816 const bool HasState = TE->hasState();
17817 const unsigned Op = HasState ? TE->getOpcode() : 0u;
17818 if (IsGather && (!HasState ||
Op != Instruction::ExtractElement) &&
17823 if (
Op == Instruction::InsertElement)
17825 if (
Op != Instruction::PHI)
17827 if (TE->State == TreeEntry::Vectorize)
17828 HasVectorPhi =
true;
17830 return isa<PoisonValue>(V) || MustGather.contains(V);
17833 if (
all_of(VectorizableTree, Compatible) && HasVectorPhi)
17838 if (ThresholdNonNegative) {
17839 const bool IsLargeTree = TreeSize >= LargeTree;
17840 bool HasSingleLoad =
false;
17841 if (
all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
17842 const bool IsGather = TE->isGather();
17843 const bool HasState = TE->hasState();
17844 const unsigned Op = HasState ? TE->getOpcode() : 0u;
17848 const bool PrevLoad = HasSingleLoad;
17850 HasState && !IsGather &&
17851 (
Op == Instruction::Load || TE->hasCopyableElements()) &&
17852 (TE->getVectorFactor() > 2 || TE->ReorderIndices.empty());
17854 if (
Op == Instruction::PHI)
17856 if (TE->getVectorFactor() <= Limit &&
17857 (
Op == Instruction::Store ||
17858 (
Op == Instruction::Load && !PrevLoad)))
17861 }
else if (HasState &&
Op == Instruction::PHI) {
17864 return IsGather && (!HasState ||
Op != Instruction::ExtractElement);
17869 if (TreeSize >= 5 && Front.getVectorFactor() <= 2 &&
17870 Front.Scalars.front()->getType()->isIntegerTy()) {
17871 bool VectorNodeFound =
false;
17872 bool AnyNonConst =
false;
17873 if (
all_of(VectorizableTree,
17874 [&](
const std::unique_ptr<TreeEntry> &TE) {
17875 if (TE->State == TreeEntry::Vectorize && TE->hasState()) {
17876 const unsigned Op = TE->getOpcode();
17877 if (
Op == Instruction::PHI ||
17878 !TE->ReorderIndices.empty())
17880 if (VectorNodeFound)
17882 VectorNodeFound =
true;
17889 return TE->isGather() ||
17890 TE->State == TreeEntry::SplitVectorize;
17900 auto IsBenignNode = [&](
const TreeEntry &TE) {
17901 if (TE.State == TreeEntry::SplitVectorize)
17903 const bool IsGather = TE.isGather();
17904 const bool HasState = TE.hasState();
17906 const unsigned Op = TE.getOpcode();
17907 if (
Op == Instruction::PHI)
17909 const unsigned ScalarsSize = TE.Scalars.size();
17910 if (TE.Idx == 0 && ScalarsSize == 2 &&
Op == Instruction::ICmp &&
17911 TreeSize > LimitTreeSize)
17913 if (ScalarsSize == 2 &&
17914 (!TE.ReuseShuffleIndices.empty() || !TE.ReorderIndices.empty() ||
17915 TE.isAltShuffle()))
17917 if (TE.hasCopyableElements() &&
17927 if (!ThresholdSet) {
17929 unsigned NumGathers = 0;
17930 if (
all_of(VectorizableTree,
17931 [&](
const std::unique_ptr<TreeEntry> &TE) {
17932 const bool IsGather = TE->isGather();
17933 if (!IsGather && TE->hasState()) {
17934 const unsigned Op = TE->getOpcode();
17935 if (
Op == Instruction::Load ||
Op == Instruction::Store) {
17942 return IsBenignNode(*TE);
17944 (StoreLoadNodes.
empty() ||
17945 (TreeSize > LimitTreeSize * StoreLoadNodes.
size() &&
17947 none_of(StoreLoadNodes, [&](
const TreeEntry *TE) {
17948 return TE->getOpcode() == Instruction::Store ||
17950 return !isa<LoadInst>(V) ||
17951 areAllUsersVectorized(cast<Instruction>(V));
17959 if (ThresholdNonNegative && TreeSize > LimitTreeSize) {
17960 const TreeEntry *VectorNode =
nullptr;
17961 if (
all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
17962 if (!TE->isGather() && TE->hasState() &&
17963 TE->State != TreeEntry::SplitVectorize &&
17964 TE->getOpcode() != Instruction::PHI) {
17967 VectorNode = TE.get();
17970 return IsBenignNode(*TE);
17977 if (ThresholdSet && TreeSize >= Limit &&
17978 Front.State == TreeEntry::SplitVectorize &&
17980 [](
const std::unique_ptr<TreeEntry> &TE) {
17981 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
17982 TE->UserTreeIndex.UserTE->Idx == 0;
17988 if (ThresholdSet && TreeSize > 2 && Front.State == TreeEntry::Vectorize &&
17989 FrontOpcode == Instruction::InsertElement &&
17990 VectorizableTree[1]->State == TreeEntry::Vectorize &&
17991 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
17993 [](
const std::unique_ptr<TreeEntry> &TE) {
17994 return TE->isGather();
18006 if (isFullyVectorizableTinyTree(ForReduction))
18012 const bool IsAllowedSingleBVNode =
18013 TreeSize > 1 || (FrontHasState && !Front.isAltShuffle() &&
18014 FrontOpcode != Instruction::PHI &&
18015 FrontOpcode != Instruction::GetElementPtr &&
18017 if (
any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
18018 return TE->isGather() &&
all_of(TE->Scalars, [&](
Value *V) {
18019 return isa<ExtractElementInst, Constant>(V) ||
18020 (IsAllowedSingleBVNode &&
18021 !V->hasNUsesOrMore(UsesLimit) &&
18022 any_of(V->users(), IsaPred<InsertElementInst>));
18027 const TreeEntry &Back = *VectorizableTree.back();
18028 if (Back.isGather() && Back.hasState() && Back.isAltShuffle()) {
18029 const unsigned BackVF = Back.getVectorFactor();
18031 !Back.Scalars.front()->getType()->isVectorTy() &&
18032 TTI->getScalarizationOverhead(
18047 constexpr unsigned SmallTree = 3;
18048 if (VectorizableTree.front()->isNonPowOf2Vec() &&
18051 [](
const std::unique_ptr<TreeEntry> &TE) {
18052 return TE->isGather() && TE->hasState() &&
18053 TE->getOpcode() == Instruction::Load &&
18061 TreeEntry &E = *VectorizableTree[Idx];
18062 if (E.State == TreeEntry::SplitVectorize)
18066 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
18088 const TreeEntry *Root = VectorizableTree.front().get();
18089 if (Root->isGather())
18098 for (
const auto &TEPtr : VectorizableTree) {
18099 if (TEPtr->CombinedOp == TreeEntry::ReducedBitcast ||
18100 TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
18101 TEPtr->CombinedOp == TreeEntry::ReducedBitcastLoads ||
18102 TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
18103 TEPtr->CombinedOp == TreeEntry::ReducedCmpBitcast) {
18104 ScalarOrPseudoEntries.
insert(TEPtr.get());
18107 if (!TEPtr->isGather()) {
18108 Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
18109 EntriesToLastInstruction.
try_emplace(TEPtr.get(), LastInst);
18110 LastInstructions.
insert(LastInst);
18112 if (TEPtr->UserTreeIndex)
18113 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(TEPtr.get());
18120 auto NoCallIntrinsicOrDoesNotReturn = [
this, &NoCallIntrinsicCache](
18125 if (CB->doesNotReturn())
18130 if (
II->isAssumeLikeIntrinsic())
18132 auto [It, Inserted] = NoCallIntrinsicCache.
try_emplace(
II);
18140 bool Res = IntrCost < CallCost;
18149 CheckedInstructions;
18150 unsigned Budget = 0;
18151 const unsigned BudgetLimit =
18156 "Expected instructions in same block.");
18157 if (
auto It = CheckedInstructions.
find(
Last);
18158 It != CheckedInstructions.
end()) {
18159 const Instruction *Checked = It->second.getPointer();
18160 const bool NoCallsInCachedRange = It->second.getInt() != 0;
18161 if (Checked ==
First)
18162 return NoCallsInCachedRange;
18176 ++
First->getIterator().getReverse(),
18178 Last->getIterator().getReverse();
18180 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
18185 CB && !NoCallIntrinsicOrDoesNotReturn(CB) && !
isVectorized(CB)) {
18186 for (
const Instruction *LastInst : LastInstsInRange)
18187 CheckedInstructions.
try_emplace(LastInst, &*PrevInstIt, 0);
18190 if (LastInstructions.
contains(&*PrevInstIt))
18191 LastInstsInRange.
push_back(&*PrevInstIt);
18200 const bool Completed = PrevInstIt == InstIt;
18201 const bool NoCallsInRange = Completed || Budget <= BudgetLimit;
18202 for (
const Instruction *LastInst : LastInstsInRange)
18204 LastInst, Completed ?
First : &*PrevInstIt, NoCallsInRange ? 1 : 0);
18205 return NoCallsInRange;
18207 auto AddCosts = [&](
const TreeEntry *
Op) {
18210 Type *ScalarTy =
Op->Scalars.front()->getType();
18211 auto It = MinBWs.find(
Op);
18212 if (It != MinBWs.end())
18215 uint64_t Scale = getScaleToLoopIterations(*
Op);
18216 InstructionCost KeepLiveCost = TTI->getCostOfKeepingLiveOverCall(VecTy);
18217 KeepLiveCost *= Scale;
18218 Cost += KeepLiveCost;
18221 Cost -=
Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(ScalarTy) *
18229 ParentOpParentToPreds;
18233 auto LoopBodyHasCall = [&](
const Loop *L) {
18234 if (
auto It = LoopBodyHasNonVecCall.
find(L);
18235 It != LoopBodyHasNonVecCall.
end())
18242 if (!CB || NoCallIntrinsicOrDoesNotReturn(CB) ||
isVectorized(CB))
18253 auto Key = std::make_pair(Root, OpParent);
18254 if (
auto It = ParentOpParentToPreds.
find(
Key);
18255 It != ParentOpParentToPreds.
end())
18263 const Loop *L = LI->getLoopFor(Root);
18264 const Loop *Outermost =
nullptr;
18265 while (L && !L->contains(OpParent)) {
18267 L = L->getParentLoop();
18269 if (Outermost && LoopBodyHasCall(Outermost))
18298 while (!Worklist.
empty()) {
18300 if (BB == OpParent) {
18304 if (!Visited.
insert(BB).second)
18309 if (DT->properlyDominates(Root, BB))
18311 auto Pair = std::make_pair(BB, OpParent);
18312 if (
auto It = ParentOpParentToPreds.
find(Pair);
18313 It != ParentOpParentToPreds.
end()) {
18327 if (Budget > BudgetLimit)
18340 auto FindNonScalarParentEntry = [&](
const TreeEntry *E) ->
const TreeEntry * {
18342 "Expected scalar or pseudo entry.");
18343 const TreeEntry *Entry = E;
18344 while (Entry->UserTreeIndex) {
18345 Entry = Entry->UserTreeIndex.UserTE;
18346 if (!ScalarOrPseudoEntries.
contains(Entry))
18351 while (!LiveEntries.
empty()) {
18353 const auto OpIt = EntriesToOperands.
find(Entry);
18354 if (OpIt == EntriesToOperands.
end())
18357 if (Operands.
empty())
18359 if (ScalarOrPseudoEntries.
contains(Entry)) {
18360 Entry = FindNonScalarParentEntry(Entry);
18362 for (
const TreeEntry *
Op : Operands) {
18363 if (!
Op->isGather())
18369 Instruction *LastInst = EntriesToLastInstruction.
at(Entry);
18371 for (
const TreeEntry *
Op : Operands) {
18372 if (!
Op->isGather())
18376 if (Entry->State == TreeEntry::SplitVectorize ||
18377 (Entry->getOpcode() != Instruction::PHI &&
Op->isGather()) ||
18383 Pred = Phi->getIncomingBlock(
Op->UserTreeIndex.EdgeIdx);
18386 if (
Op->isGather()) {
18387 assert(Entry->getOpcode() == Instruction::PHI &&
18388 "Expected phi node only.");
18390 ->getIncomingBlock(
Op->UserTreeIndex.EdgeIdx);
18392 for (
Value *V :
Op->Scalars) {
18403 OpLastInst = EntriesToLastInstruction.
at(
Op);
18407 if (OpParent == Parent) {
18408 if (Entry->getOpcode() == Instruction::PHI) {
18409 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
18413 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
18419 if (Entry->getOpcode() != Instruction::PHI &&
18420 !CheckForNonVecCallsInSameBlock(
18426 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
18432 if (!CheckPredecessors(Parent, Pred, OpParent)) {
18448 const auto *I1 = IE1;
18449 const auto *I2 = IE2;
18461 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
18464 if (I2 && ((I2 == IE2 || I2->
hasOneUse())) &&
18467 }
while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
18474struct ValueSelect {
18475 template <
typename U>
18476 static std::enable_if_t<std::is_same_v<Value *, U>,
Value *>
get(
Value *V) {
18479 template <
typename U>
18480 static std::enable_if_t<!std::is_same_v<Value *, U>,
U>
get(
Value *) {
18498template <
typename T>
18504 assert(!ShuffleMask.empty() &&
"Empty list of shuffles for inserts.");
18506 auto VMIt = std::next(ShuffleMask.begin());
18509 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
18511 if (!IsBaseUndef.
all()) {
18513 std::pair<T *, bool> Res =
18514 ResizeAction(ShuffleMask.begin()->first, Mask,
false);
18516 for (
unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
18520 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
18522 [[maybe_unused]]
auto *V = ValueSelect::get<T *>(
Base);
18523 assert((!V || GetVF(V) == Mask.size()) &&
18524 "Expected base vector of VF number of elements.");
18525 Prev = Action(Mask, {
nullptr, Res.first});
18526 }
else if (ShuffleMask.size() == 1) {
18529 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
18535 Prev = Action(Mask, {ShuffleMask.begin()->first});
18539 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
18540 unsigned Vec2VF = GetVF(VMIt->first);
18541 if (Vec1VF == Vec2VF) {
18545 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
18548 Mask[
I] = SecMask[
I] + Vec1VF;
18551 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
18554 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
18556 std::pair<T *, bool> Res2 =
18557 ResizeAction(VMIt->first, VMIt->second,
false);
18559 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
18566 Mask[
I] = (Res2.second ?
I : SecMask[
I]) + VF;
18569 Prev = Action(Mask, {Res1.first, Res2.first});
18571 VMIt = std::next(VMIt);
18573 [[maybe_unused]]
bool IsBaseNotUndef = !IsBaseUndef.
all();
18575 for (
auto E = ShuffleMask.end(); VMIt !=
E; ++VMIt) {
18577 std::pair<T *, bool> Res =
18578 ResizeAction(VMIt->first, VMIt->second,
false);
18580 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
18583 "Multiple uses of scalars.");
18584 Mask[
I] = (Res.second ?
I : SecMask[
I]) + VF;
18589 Prev = Action(Mask, {Prev, Res.first});
18601 << VectorizableTree.size() <<
".\n");
18605 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
18607 auto IsExternallyUsedV = [&](
Value *V) {
18608 auto [It, Inserted] = ExternalUseVCache.
try_emplace(V);
18612 if (V->hasOneUse() || V->getType()->isVoidTy()) {
18614 }
else if (V->hasNUsesOrMore(NumVectScalars)) {
18620 m_InsertElt(m_Value(), m_OneUse(m_CastOrSelf(m_Specific(I))),
18623 if (match(U, m_InsertElt(m_Value(), m_Specific(I), m_ConstantInt())))
18625 if (match(U, m_Store(m_OneUse(m_CastOrSelf(m_Specific(I))), m_Value())))
18627 if (match(U, m_Store(m_Specific(I), m_Value())))
18629 ArrayRef<TreeEntry *> Entries = getTreeEntries(U);
18630 if (Entries.empty() && !MustGather.contains(U))
18632 if (any_of(Entries,
18633 [&](TreeEntry *TE) { return DeletedNodes.contains(TE); }))
18635 return any_of(ValueToGatherNodes.lookup(U), [&](
const TreeEntry *TE) {
18636 return DeletedNodes.contains(TE);
18643 auto IsExternallyUsed = [&](
const TreeEntry &TE,
Value *V) {
18644 assert(TE.hasState() && !TE.isGather() &&
18645 TE.State != TreeEntry::SplitVectorize &&
"Expected vector node.");
18646 if (TE.hasCopyableElements() && TE.isCopyableElement(V))
18648 return IsExternallyUsedV(V);
18655 for (
const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
18656 TreeEntry &TE = *Ptr;
18659 if (TE.State == TreeEntry::CombinedVectorize) {
18661 dbgs() <<
"SLP: Skipping cost for combined node that starts with "
18662 << *TE.Scalars[0] <<
".\n";
18663 TE.dump();
dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
18664 NodesCosts.try_emplace(&TE);
18667 if (
TE.hasState() &&
18668 (
TE.isGather() ||
TE.State == TreeEntry::SplitVectorize)) {
18669 if (
const TreeEntry *
E =
18670 getSameValuesTreeEntry(
TE.getMainOp(),
TE.Scalars);
18671 E &&
E->getVectorFactor() ==
TE.getVectorFactor()) {
18676 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
18677 NodesCosts.try_emplace(&TE);
18684 assert((!
TE.isGather() ||
TE.Idx == 0 ||
TE.UserTreeIndex) &&
18685 "Expected gather nodes with users only.");
18688 uint64_t Scale = 0;
18689 bool CostIsFree =
C == 0;
18694 const bool IsGatherLike =
18695 TE.isGather() ||
TE.State == TreeEntry::SplitVectorize;
18696 if (!CostIsFree && !
TE.isGather() &&
TE.hasState()) {
18697 if (PrevVecParent ==
TE.getMainOp()->getParent()) {
18703 if (!CostIsFree && !Scale) {
18704 Scale = IsGatherLike ? getGatherNodeEffectiveScale(TE)
18705 : getScaleToLoopIterations(
TE);
18708 if (!
TE.isGather() &&
TE.hasState()) {
18709 PrevVecParent = TE.getMainOp()->getParent();
18714 NodesCosts.try_emplace(&TE,
C);
18717 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
18719 if (
TE.Idx > 0 && !
TE.UserTreeIndex &&
TE.hasState() &&
18720 TE.getOpcode() == Instruction::Load)
18721 GatheredLoadsNodes.insert(&TE);
18722 if (!
TE.isGather() &&
TE.State != TreeEntry::SplitVectorize &&
18723 !(
TE.Idx == 0 && (
TE.getOpcode() == Instruction::InsertElement ||
18724 TE.getOpcode() == Instruction::Store))) {
18727 for (
Value *V :
TE.Scalars) {
18728 if (IsExternallyUsed(TE, V))
18729 DemandedElts.
setBit(
TE.findLaneForValue(V));
18731 if (!DemandedElts.
isZero()) {
18732 Type *ScalarTy =
TE.Scalars.front()->getType();
18733 auto It = MinBWs.find(&TE);
18734 if (It != MinBWs.end())
18738 *TTI, ScalarTy, VecTy, DemandedElts,
false,
18740 if (ExtCost.
isValid() && ExtCost != 0) {
18742 Scale = getScaleToLoopIterations(TE);
18746 ExtractCosts.try_emplace(&TE, ExtCost);
18755 constexpr unsigned PartLimit = 2;
18756 const unsigned Sz =
18758 const unsigned MinVF =
getMinVF(Sz);
18760 VectorizableTree.front()->Scalars.size() * PartLimit <= MinVF &&
18761 (!VectorizableTree.front()->hasState() ||
18762 (VectorizableTree.front()->getOpcode() != Instruction::Store &&
18763 LI->getLoopFor(VectorizableTree.front()->getMainOp()->getParent()))))
18770 std::tuple<InstructionCost, InstructionCost, SmallVector<unsigned>>>
18771 SubtreeCosts(VectorizableTree.size());
18772 auto UpdateParentNodes =
18773 [&](
const TreeEntry *UserTE,
const TreeEntry *
TE,
18775 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
18777 bool AddToList =
true) {
18779 VisitedUser.insert(std::make_pair(TE, UserTE)).second) {
18780 std::get<0>(SubtreeCosts[UserTE->Idx]) += TotalCost;
18781 std::get<1>(SubtreeCosts[UserTE->Idx]) +=
Cost;
18783 std::get<2>(SubtreeCosts[UserTE->Idx]).push_back(
TE->Idx);
18784 UserTE = UserTE->UserTreeIndex.UserTE;
18787 for (
const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
18788 TreeEntry &
TE = *Ptr;
18791 std::get<0>(SubtreeCosts[
TE.Idx]) +=
C + ExtractCost;
18792 std::get<1>(SubtreeCosts[
TE.Idx]) +=
C;
18793 if (
const TreeEntry *UserTE =
TE.UserTreeIndex.UserTE) {
18794 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
18796 UpdateParentNodes(UserTE, &TE,
C + ExtractCost,
C, VisitedUser);
18799 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4> Visited;
18800 for (TreeEntry *TE : GatheredLoadsNodes) {
18803 for (
Value *V :
TE->Scalars) {
18804 for (
const TreeEntry *BVTE : ValueToGatherNodes.lookup(V))
18805 UpdateParentNodes(BVTE, TE, TotalCost,
Cost, Visited,
18810 using CostIndicesTy =
18812 SmallVector<unsigned>>>;
18813 struct FirstGreater {
18814 bool operator()(
const CostIndicesTy &
LHS,
const CostIndicesTy &
RHS)
const {
18815 return std::get<0>(
LHS.second) < std::get<0>(
RHS.second) ||
18816 (std::get<0>(
LHS.second) == std::get<0>(
RHS.second) &&
18817 LHS.first->Idx <
RHS.first->Idx);
18820 PriorityQueue<CostIndicesTy, SmallVector<CostIndicesTy>, FirstGreater>
18822 for (
const auto [Idx,
P] :
enumerate(SubtreeCosts))
18823 Worklist.emplace(VectorizableTree[Idx].
get(),
P);
18826 if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < MinVF &&
18827 VectorizableTree.front()->hasState() &&
18828 VectorizableTree.front()->getOpcode() == Instruction::Store &&
18829 (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
18833 bool PreferTrimmedTree =
false;
18834 while (!Worklist.empty() && std::get<0>(Worklist.top().second) > 0) {
18835 TreeEntry *
TE = Worklist.top().first;
18836 if (
TE->isGather() ||
TE->Idx == 0 || DeletedNodes.contains(TE) ||
18839 (
TE->UserTreeIndex &&
18840 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize &&
18842 ArrayRef<TreeEntry *> Entries = getSplitTreeEntries(V);
18843 return Entries.size() > 1;
18849 if (
TE->State == TreeEntry::Vectorize && !
TE->isAltShuffle() &&
18850 (
TE->getOpcode() == Instruction::ICmp ||
18851 TE->getOpcode() == Instruction::FCmp) &&
18853 auto *I = dyn_cast<CmpInst>(V);
18856 return I->getPredicate() !=
18857 cast<CmpInst>(TE->getMainOp())->getPredicate();
18864 InstructionCost TotalSubtreeCost = std::get<0>(Worklist.top().second);
18866 if (TotalSubtreeCost < TE->Scalars.size()) {
18870 if (!TransformedToGatherNodes.empty()) {
18871 for (
unsigned Idx : std::get<2>(Worklist.top().second)) {
18872 auto It = TransformedToGatherNodes.find(VectorizableTree[Idx].
get());
18873 if (It != TransformedToGatherNodes.end()) {
18874 TotalSubtreeCost -= std::get<0>(SubtreeCosts[Idx]);
18875 SubtreeCost -= std::get<1>(SubtreeCosts[Idx]);
18876 TotalSubtreeCost += It->second;
18877 SubtreeCost += It->second;
18881 if (TotalSubtreeCost < 0 || TotalSubtreeCost < TE->Scalars.size()) {
18885 const unsigned EntryVF =
TE->getVectorFactor();
18889 DemandedElts.
setBit(Idx);
18893 auto It = MinBWs.find(TE);
18894 if (It != MinBWs.end())
18898 *TTI, ScalarTy, VecTy, DemandedElts,
18900 SmallVector<int>
Mask;
18901 if (!
TE->ReorderIndices.empty() &&
18902 TE->State != TreeEntry::CompressVectorize &&
18903 (
TE->State != TreeEntry::StridedVectorize ||
18905 SmallVector<int> NewMask;
18906 if (
TE->getOpcode() == Instruction::Store) {
18908 NewMask.
resize(
TE->ReorderIndices.size());
18915 if (!
TE->ReuseShuffleIndices.empty())
18922 if ((!
TE->hasState() || !
TE->isAltShuffle()) &&
18924 return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
18925 isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
18929 ArrayRef<unsigned> Nodes = std::get<2>(Worklist.top().second);
18935 auto IsEqualCostAltShuffleToTrim = [&]() {
18936 return TotalSubtreeCost == GatherCost &&
TE->hasState() &&
18938 none_of(Nodes, [&](
unsigned Idx) {
18939 return VectorizableTree[Idx]->hasState() &&
18940 VectorizableTree[Idx]->getOpcode() ==
18941 Instruction::ExtractElement;
18946 bool HasNonPowerOf2 =
any_of(Nodes, [&](
unsigned Idx) {
18950 if (TotalSubtreeCost > GatherCost + TrimMargin ||
18951 IsEqualCostAltShuffleToTrim()) {
18952 PreferTrimmedTree |= TotalSubtreeCost == GatherCost;
18955 if (VectorizableTree.front()->hasState() &&
18956 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
18958 return InstructionCost::getInvalid();
18960 LLVM_DEBUG(
dbgs() <<
"SLP: Trimming unprofitable subtree at node "
18961 <<
TE->Idx <<
" with cost "
18962 << std::get<0>(Worklist.top().second)
18963 <<
" and gather cost " << GatherCost <<
".\n");
18964 if (
TE->UserTreeIndex) {
18965 TransformedToGatherNodes.try_emplace(TE, GatherCost);
18966 NodesCosts.erase(TE);
18968 DeletedNodes.insert(TE);
18969 TransformedToGatherNodes.erase(TE);
18970 NodesCosts.erase(TE);
18972 for (
unsigned Idx : Nodes) {
18973 TreeEntry &ChildTE = *VectorizableTree[Idx];
18974 DeletedNodes.insert(&ChildTE);
18975 TransformedToGatherNodes.erase(&ChildTE);
18976 NodesCosts.erase(&ChildTE);
18983 return std::get<1>(SubtreeCosts.front());
18985 SmallPtrSet<TreeEntry *, 4> GatheredLoadsToDelete;
18992 for (TreeEntry *TE : GatheredLoadsNodes) {
18993 if (DeletedNodes.contains(TE) || TransformedToGatherNodes.contains(TE))
18995 GatheredLoadsToDelete.
insert(TE);
18998 SmallDenseMap<const TreeEntry *, SmallVector<Value *>> ValuesToInsert;
18999 for (
Value *V :
TE->Scalars) {
19000 unsigned Pos =
TE->findLaneForValue(V);
19001 for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
19002 if (DeletedNodes.contains(BVE))
19004 DemandedElts.
setBit(Pos);
19005 ValuesToInsert.
try_emplace(BVE).first->second.push_back(V);
19008 if (!DemandedElts.
isZero()) {
19009 Type *ScalarTy =
TE->Scalars.front()->getType();
19010 auto It = MinBWs.find(TE);
19011 if (It != MinBWs.end())
19015 *TTI, ScalarTy, VecTy, DemandedElts,
19018 for (
const auto &[BVE, Values] : ValuesToInsert) {
19022 for (
Value *V : Values) {
19023 unsigned Pos = BVE->findLaneForValue(V);
19025 BVDemandedElts.
setBit(Pos);
19027 auto *BVVecTy =
getWidenedType(ScalarTy, BVE->getVectorFactor());
19029 *TTI, ScalarTy, BVVecTy, BVDemandedElts,
19033 if (ExtractsCost < BVCost) {
19034 LoadsExtractsCost += ExtractsCost;
19035 GatheredLoadsToDelete.
erase(TE);
19038 LoadsExtractsCost += BVCost;
19040 NodesCosts.erase(TE);
19044 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19045 if (
TE->UserTreeIndex &&
19046 GatheredLoadsToDelete.
contains(
TE->UserTreeIndex.UserTE)) {
19047 DeletedNodes.insert(
TE.get());
19048 NodesCosts.erase(
TE.get());
19049 GatheredLoadsToDelete.
insert(
TE.get());
19051 if (GatheredLoadsToDelete.
contains(
TE.get()))
19052 DeletedNodes.insert(
TE.get());
19055 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
19056 if (!
TE->UserTreeIndex && TransformedToGatherNodes.contains(
TE.get())) {
19057 assert(
TE->getOpcode() == Instruction::Load &&
"Expected load only.");
19060 if (DeletedNodes.contains(
TE.get()))
19062 if (!NodesCosts.contains(
TE.get())) {
19064 getEntryCost(
TE.get(), VectorizedVals, CheckedExtracts);
19065 if (!
C.isValid() ||
C == 0) {
19066 NodesCosts.try_emplace(
TE.get(),
C);
19069 uint64_t Scale = EntryToScale.
lookup(
TE.get());
19071 const bool IsGatherLike =
19072 TE->isGather() ||
TE->State == TreeEntry::SplitVectorize;
19073 Scale = IsGatherLike ? getGatherNodeEffectiveScale(*
TE.get())
19074 : getScaleToLoopIterations(*
TE.
get());
19077 NodesCosts.try_emplace(
TE.get(),
C);
19081 LLVM_DEBUG(
dbgs() <<
"SLP: Recalculate costs after tree trimming.\n");
19083 for (
const auto &
P : NodesCosts) {
19084 NewCost +=
P.second;
19085 LLVM_DEBUG(
dbgs() <<
"SLP: Adding cost " <<
P.second <<
" for bundle "
19088 <<
"SLP: Current total cost = " << NewCost <<
"\n");
19090 if (NewCost + LoadsExtractsCost >
Cost ||
19091 (!PreferTrimmedTree && NewCost + LoadsExtractsCost ==
Cost)) {
19092 DeletedNodes.clear();
19093 TransformedToGatherNodes.clear();
19098 if (VectorizableTree.size() >= 2 && VectorizableTree.front()->hasState() &&
19099 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
19100 TransformedToGatherNodes.contains(VectorizableTree[1].get()))
19101 return InstructionCost::getInvalid();
19102 if (VectorizableTree.size() >= 3 && VectorizableTree.front()->hasState() &&
19103 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
19104 VectorizableTree[1]->hasState() &&
19105 VectorizableTree[1]->State == TreeEntry::Vectorize &&
19106 (VectorizableTree[1]->getOpcode() == Instruction::ZExt ||
19107 VectorizableTree[1]->getOpcode() == Instruction::SExt ||
19108 VectorizableTree[1]->getOpcode() == Instruction::Trunc) &&
19109 TransformedToGatherNodes.contains(VectorizableTree[2].get()))
19110 return InstructionCost::getInvalid();
19118template <
typename T>
struct ShuffledInsertData {
19122 MapVector<T, SmallVector<int>> ValueMasks;
19139 VectorizableTree.front()->Scalars.front()->getType()))) {
19140 unsigned NumScalar = getNumScalarInsts();
19141 unsigned NumVector = getNumVectorInsts();
19142 LLVM_DEBUG(
dbgs() <<
"SLP: Inst count check: vector=" << NumVector
19143 <<
" scalar=" << NumScalar <<
"\n");
19144 if (NumVector > NumScalar) {
19146 << NumVector <<
" > scalar inst count " << NumScalar
19157 if (!
C.isValid() ||
C == 0)
19160 EntryToScale.
try_emplace(std::make_tuple(&TE, Scalar, U), 0)
19161 .first->getSecond();
19163 Scale = getScaleToLoopIterations(TE, Scalar, U);
19164 LLVM_DEBUG(
dbgs() <<
"Scale " << Scale <<
" For entry " << TE.Idx <<
"\n");
19168 if (UserIgnoreList) {
19170 ReductionCost = ScaleCost(ReductionCost, *VectorizableTree.front().get(),
19171 nullptr, ReductionRoot);
19175 Cost += ReductionCost;
19179 constexpr unsigned CostLimit = 100;
19181 (VectorizableTree.size() - DeletedNodes.size()) *
19182 VectorizableTree.front()->getVectorFactor() <
19187 none_of(ExternalUses, [](
const ExternalUser &EU) {
19198 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
19205 bool AllUsersGEPSWithStoresLoads =
true;
19206 SmallBitVector UsedLanes(VectorizableTree.front()->getVectorFactor());
19208 Type *UserScalarTy =
nullptr;
19209 for (ExternalUser &EU : ExternalUses) {
19210 ScalarUserAndIdx.
emplace_back(EU.Scalar, EU.User, EU.Lane);
19211 if (EU.E.Idx == 0) {
19212 UsedLanes.
set(EU.Lane);
19217 if (!UserScalarTy) {
19218 UserScalarTy = LocalTy;
19219 }
else if (UserScalarTy != LocalTy) {
19220 AllUsersGEPSWithStoresLoads =
false;
19225 AllUsersGEPSWithStoresLoads =
false;
19230 AllUsersGEPSWithStoresLoads &= UsedLanes.
all();
19232 for (ExternalUser &EU : ExternalUses) {
19233 LLVM_DEBUG(
dbgs() <<
"SLP: Computing cost for external use of TreeEntry "
19234 << EU.E.Idx <<
" in lane " << EU.Lane <<
"\n");
19236 else dbgs() <<
" User: nullptr\n");
19237 LLVM_DEBUG(
dbgs() <<
" Use: " << EU.Scalar->getNameOrAsOperand() <<
"\n");
19242 if (EphValues.count(EU.User))
19246 if (!CheckedScalarUser.
insert(std::make_pair(EU.Scalar, EU.User)).second ||
19248 CheckedScalarUser.
contains(std::make_pair(EU.Scalar,
nullptr))))
19256 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
19262 !ExtractCostCalculated.
insert(EU.Scalar).second)
19275 if (!UsedInserts.
insert(VU).second)
19279 const TreeEntry *ScalarTE = &EU.E;
19282 [
this, VU](
const ShuffledInsertData<const TreeEntry *> &
Data) {
19287 Value *Op0 =
II->getOperand(0);
19294 if (It == ShuffledInserts.
end()) {
19296 Data.InsertElements.emplace_back(VU);
19298 VecId = ShuffledInserts.
size() - 1;
19299 auto It = MinBWs.find(ScalarTE);
19300 if (It != MinBWs.end() &&
19302 .
insert(std::make_pair(ScalarTE, FTy->getElementType()))
19304 unsigned BWSz = It->second.first;
19305 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
19306 unsigned VecOpcode;
19307 if (DstBWSz < BWSz)
19308 VecOpcode = Instruction::Trunc;
19311 It->second.second ? Instruction::SExt : Instruction::ZExt;
19316 FTy->getNumElements()),
19319 <<
" for extending externally used vector with "
19320 "non-equal minimum bitwidth.\n");
19325 It->InsertElements.front() = VU;
19326 VecId = std::distance(ShuffledInserts.
begin(), It);
19328 int InIdx = *InsertIdx;
19330 ShuffledInserts[VecId].ValueMasks[ScalarTE];
19333 Mask[InIdx] = EU.Lane;
19334 DemandedElts[VecId].setBit(InIdx);
19345 auto *ScalarTy = EU.Scalar->getType();
19346 const unsigned BundleWidth = EU.E.getVectorFactor();
19347 assert(EU.Lane < BundleWidth &&
"Extracted lane out of bounds.");
19349 const TreeEntry *Entry = &EU.E;
19350 auto It = MinBWs.find(Entry);
19351 if (It != MinBWs.end()) {
19356 ? Instruction::ZExt
19357 : Instruction::SExt;
19362 << ExtraCost <<
"\n");
19366 CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
19367 LLVM_DEBUG(
dbgs() <<
" ExtractElement cost for " << *ScalarTy <<
" from "
19368 << *VecTy <<
": " << ExtraCost <<
"\n");
19371 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
19372 Entry->getOpcode() == Instruction::Load) {
19374 auto IsPhiInLoop = [&](
const ExternalUser &U) {
19377 const Loop *L = LI->getLoopFor(Phi->getParent());
19378 return L && (Phi->getParent() ==
I->getParent() ||
19379 L == LI->getLoopFor(
I->getParent()));
19383 if (!ValueToExtUses) {
19384 ValueToExtUses.emplace();
19385 for (
const auto &
P :
enumerate(ExternalUses)) {
19387 if (IsPhiInLoop(
P.value()))
19390 ValueToExtUses->try_emplace(
P.value().Scalar,
P.index());
19397 auto OperandIsScalar = [&](
Value *V) {
19403 return !EE->hasOneUse() || !MustGather.contains(EE);
19406 return ValueToExtUses->contains(V);
19408 bool CanBeUsedAsScalar =
all_of(Inst->operands(), OperandIsScalar);
19409 bool CanBeUsedAsScalarCast =
false;
19412 Op &&
all_of(
Op->operands(), OperandIsScalar)) {
19417 if (ScalarCost + OpCost <= ExtraCost) {
19418 CanBeUsedAsScalar = CanBeUsedAsScalarCast =
true;
19419 ScalarCost += OpCost;
19423 if (CanBeUsedAsScalar) {
19424 bool KeepScalar = ScalarCost <= ExtraCost;
19428 bool IsProfitablePHIUser =
19430 VectorizableTree.front()->Scalars.size() > 2)) &&
19431 VectorizableTree.front()->hasState() &&
19432 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
19436 auto *PHIUser = dyn_cast<PHINode>(U);
19437 return (!PHIUser ||
19438 PHIUser->getParent() != VectorizableTree.front()
19444 return ValueToExtUses->contains(V);
19446 if (IsProfitablePHIUser) {
19450 (!GatheredLoadsEntriesFirst.has_value() ||
19451 Entry->Idx < *GatheredLoadsEntriesFirst)) {
19452 unsigned ScalarUsesCount =
count_if(Entry->Scalars, [&](
Value *V) {
19453 return ValueToExtUses->contains(V);
19455 auto It = ExtractsCount.
find(Entry);
19456 if (It != ExtractsCount.
end()) {
19457 assert(ScalarUsesCount >= It->getSecond().size() &&
19458 "Expected total number of external uses not less than "
19459 "number of scalar uses.");
19460 ScalarUsesCount -= It->getSecond().size();
19465 KeepScalar = ScalarUsesCount <= 1 || !
has_single_bit(ScalarUsesCount);
19468 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
19469 for (
Value *V : Inst->operands()) {
19470 auto It = ValueToExtUses->find(V);
19471 if (It != ValueToExtUses->end()) {
19473 ExternalUses[It->second].User =
nullptr;
19476 ExtraCost = ScalarCost;
19477 if (!IsPhiInLoop(EU))
19478 ExtractsCount[Entry].
insert(Inst);
19479 if (CanBeUsedAsScalarCast) {
19480 ScalarOpsFromCasts.
insert(Inst->getOperand(0));
19484 for (
Value *V : IOp->operands()) {
19485 auto It = ValueToExtUses->find(V);
19486 if (It != ValueToExtUses->end()) {
19488 ExternalUses[It->second].User =
nullptr;
19497 ExtraCost = ScaleCost(ExtraCost, *Entry, EU.Scalar,
19500 ExtractCost += ExtraCost;
19509 if (AllUsersGEPSWithStoresLoads && !Pointers.
empty()) {
19510 const TreeEntry &RootEntry = *VectorizableTree.front();
19511 const bool AnyRootKeptAsScalar =
any_of(RootEntry.Scalars, [&](
Value *V) {
19512 return ExternalUsesAsOriginalScalar.contains(V);
19514 const Value *CommonBase =
nullptr;
19515 bool HaveCommonBase =
true;
19516 for (
const Value *
P : Pointers) {
19520 else if (CommonBase !=
Op) {
19521 HaveCommonBase =
false;
19525 if (!AnyRootKeptAsScalar && HaveCommonBase) {
19527 auto *VecTy =
getWidenedType(UserScalarTy, RootEntry.Scalars.size());
19529 Pointers, CommonBase, TTI::PointersChainInfo::getUnitStride(),
19532 Pointers, CommonBase, TTI::PointersChainInfo::getUnknownStride(),
19534 ExtractCost += ScaleCost(VectorGEPCost - ScalarGEPCost, RootEntry);
19539 for (
Value *V : ScalarOpsFromCasts) {
19540 ExternalUsesAsOriginalScalar.insert(V);
19542 const auto *It =
find_if_not(TEs, [&](TreeEntry *TE) {
19543 return TransformedToGatherNodes.contains(TE) ||
19544 DeletedNodes.contains(TE);
19546 if (It != TEs.end()) {
19547 const TreeEntry *UserTE = *It;
19548 ExternalUses.emplace_back(V,
nullptr, *UserTE,
19549 UserTE->findLaneForValue(V));
19554 if (!VectorizedVals.
empty()) {
19555 const TreeEntry &Root = *VectorizableTree.front();
19556 auto BWIt = MinBWs.find(&Root);
19557 if (BWIt != MinBWs.end()) {
19558 Type *DstTy = Root.Scalars.front()->getType();
19559 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->
getScalarType());
19561 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
19562 if (OriginalSz != SrcSz) {
19563 unsigned Opcode = Instruction::Trunc;
19564 if (OriginalSz > SrcSz)
19565 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
19572 TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
19575 CastCost = ScaleCost(CastCost, Root,
nullptr, ReductionRoot);
19585 VectorizableTree[1]->hasState() &&
19586 VectorizableTree[1]->State == TreeEntry::Vectorize &&
19587 all_of(VectorizableTree[1]->Scalars, [&](
Value *V) {
19588 return ExternalUsesAsOriginalScalar.contains(V);
19592 Cost += ExtractCost;
19593 auto &&ResizeToVF = [
this, &Cost](
const TreeEntry *TE,
ArrayRef<int> Mask,
19594 bool ForSingleMask) {
19596 unsigned VF = Mask.size();
19597 unsigned VecVF = TE->getVectorFactor();
19598 bool HasLargeIndex =
19599 any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); });
19600 if ((VF != VecVF && HasLargeIndex) ||
19603 if (HasLargeIndex) {
19605 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
19611 dbgs() <<
"SLP: Adding cost " <<
C
19612 <<
" for final shuffle of insertelement external users.\n";
19613 TE->dump();
dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
19615 return std::make_pair(TE,
true);
19618 if (!ForSingleMask) {
19620 for (
unsigned I = 0;
I < VF; ++
I) {
19622 ResizeMask[Mask[
I]] = Mask[
I];
19629 dbgs() <<
"SLP: Adding cost " <<
C
19630 <<
" for final shuffle of insertelement external users.\n";
19631 TE->dump();
dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
19636 return std::make_pair(TE,
false);
19639 for (
int I = 0, E = ShuffledInserts.
size();
I < E; ++
I) {
19640 Value *
Base = ShuffledInserts[
I].InsertElements.
front()->getOperand(0);
19641 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
19645 assert((TEs.size() == 1 || TEs.size() == 2) &&
19646 "Expected exactly 1 or 2 tree entries.");
19647 if (TEs.size() == 1) {
19649 VF = TEs.front()->getVectorFactor();
19650 auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
19654 (
Data.index() < VF &&
19655 static_cast<int>(
Data.index()) ==
Data.value());
19659 C = ScaleCost(
C, *TEs.front());
19661 <<
" for final shuffle of insertelement "
19662 "external users.\n";
19663 TEs.front()->
dump();
19664 dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
19670 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
19671 VF = TEs.front()->getVectorFactor();
19675 auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
19678 C = ScaleCost(
C, *TEs.back());
19680 <<
" for final shuffle of vector node and external "
19681 "insertelement users.\n";
19682 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
19683 dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
19691 [](
const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
19692 EstimateShufflesCost);
19695 ShuffledInserts[
I].InsertElements.
front()->getType()),
19698 Cost -= InsertCost;
19702 if (ReductionBitWidth != 0) {
19703 assert(UserIgnoreList &&
"Expected reduction tree.");
19704 const TreeEntry &E = *VectorizableTree.front();
19705 auto It = MinBWs.find(&E);
19706 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
19707 unsigned SrcSize = It->second.first;
19708 unsigned DstSize = ReductionBitWidth;
19709 unsigned Opcode = Instruction::Trunc;
19710 if (SrcSize < DstSize) {
19711 bool IsArithmeticExtendedReduction =
19714 return is_contained({Instruction::Add, Instruction::FAdd,
19715 Instruction::Mul, Instruction::FMul,
19716 Instruction::And, Instruction::Or,
19720 if (IsArithmeticExtendedReduction)
19722 Instruction::BitCast;
19724 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
19726 if (Opcode != Instruction::BitCast) {
19728 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
19730 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
19732 switch (E.getOpcode()) {
19733 case Instruction::SExt:
19734 case Instruction::ZExt:
19735 case Instruction::Trunc: {
19736 const TreeEntry *OpTE = getOperandEntry(&E, 0);
19737 CCH = getCastContextHint(*OpTE);
19744 TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
19746 CastCost = ScaleCost(CastCost, *VectorizableTree.front().get(),
19747 nullptr, ReductionRoot);
19750 <<
" for final resize for reduction from " << SrcVecTy
19751 <<
" to " << DstVecTy <<
"\n";
19752 dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
19757 std::optional<InstructionCost> SpillCost;
19760 Cost += *SpillCost;
19766 OS <<
"SLP: Spill Cost = ";
19771 OS <<
".\nSLP: Extract Cost = " << ExtractCost <<
".\n";
19773 OS <<
"SLP: Reduction Cost = " << ReductionCost <<
".\n";
19774 OS <<
"SLP: Total Cost = " << Cost <<
".\n";
19778 ViewGraph(
this,
"SLP" + F->getName(),
false, Str);
19789std::optional<TTI::ShuffleKind>
19790BoUpSLP::tryToGatherSingleRegisterExtractElements(
19796 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
19812 if (Idx >= VecTy->getNumElements()) {
19816 SmallBitVector ExtractMask(VecTy->getNumElements(),
true);
19817 ExtractMask.reset(*Idx);
19824 VectorOpToIdx[EI->getVectorOperand()].push_back(
I);
19829 stable_sort(Vectors, [](
const auto &P1,
const auto &P2) {
19830 return P1.second.size() >
P2.second.size();
19833 const int UndefSz = UndefVectorExtracts.
size();
19834 unsigned SingleMax = 0;
19835 unsigned PairMax = 0;
19836 if (!Vectors.
empty()) {
19837 SingleMax = Vectors.
front().second.size() + UndefSz;
19838 if (Vectors.
size() > 1) {
19839 auto *ItNext = std::next(Vectors.
begin());
19840 PairMax = SingleMax + ItNext->second.size();
19843 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
19844 return std::nullopt;
19850 if (SingleMax >= PairMax && SingleMax) {
19851 for (
int Idx : Vectors.
front().second)
19852 std::swap(GatheredExtracts[Idx], VL[Idx]);
19853 }
else if (!Vectors.
empty()) {
19854 for (
unsigned Idx : {0, 1})
19855 for (
int Idx : Vectors[Idx].second)
19856 std::swap(GatheredExtracts[Idx], VL[Idx]);
19859 for (
int Idx : UndefVectorExtracts)
19860 std::swap(GatheredExtracts[Idx], VL[Idx]);
19863 std::optional<TTI::ShuffleKind> Res =
19869 return std::nullopt;
19873 for (
int I = 0,
E = GatheredExtracts.size();
I <
E; ++
I) {
19894BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
19895 SmallVectorImpl<int> &Mask,
19896 unsigned NumParts)
const {
19897 assert(NumParts > 0 &&
"NumParts expected be greater than or equal to 1.");
19904 const unsigned PartOffset = Part * SliceSize;
19907 if (PartOffset + PartSize > VL.
size())
19911 SmallVector<int> SubMask;
19912 std::optional<TTI::ShuffleKind> Res =
19913 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
19914 ShufflesRes[Part] = Res;
19915 copy(SubMask, std::next(
Mask.begin(), Part * SliceSize));
19916 if (SubVL.
size() != SliceSize)
19919 if (
none_of(ShufflesRes, [](
const std::optional<TTI::ShuffleKind> &Res) {
19920 return Res.has_value();
19922 ShufflesRes.clear();
19923 return ShufflesRes;
19926std::optional<TargetTransformInfo::ShuffleKind>
19927BoUpSLP::isGatherShuffledSingleRegisterEntry(
19929 SmallVectorImpl<const TreeEntry *> &Entries,
unsigned Part,
bool ForOrder,
19930 unsigned SliceSize) {
19933 return std::nullopt;
19934 const unsigned MaskBase = Part * SliceSize;
19937 auto GetUserEntry = [&](
const TreeEntry *
TE) {
19938 while (
TE->UserTreeIndex &&
TE->UserTreeIndex.EdgeIdx == UINT_MAX)
19939 TE =
TE->UserTreeIndex.UserTE;
19940 if (TE == VectorizableTree.front().get())
19941 return EdgeInfo(
const_cast<TreeEntry *
>(TE), 0);
19942 return TE->UserTreeIndex;
19944 auto HasGatherUser = [&](
const TreeEntry *
TE) {
19945 while (
TE->Idx != 0 &&
TE->UserTreeIndex) {
19946 if (
TE->UserTreeIndex.EdgeIdx == UINT_MAX)
19948 TE =
TE->UserTreeIndex.UserTE;
19952 const EdgeInfo TEUseEI = GetUserEntry(TE);
19953 if (!TEUseEI || (TEUseEI.UserTE->Idx == 0 && TEUseEI.UserTE->isGather() &&
19954 !TEUseEI.UserTE->hasState()))
19955 return std::nullopt;
19956 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
19961 TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() :
nullptr);
19962 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
19963 TEInsertBlock =
PHI->getIncomingBlock(TEUseEI.EdgeIdx);
19966 TEInsertBlock = TEInsertPt->
getParent();
19968 if (!DT->isReachableFromEntry(TEInsertBlock))
19969 return std::nullopt;
19970 auto *NodeUI = DT->getNode(TEInsertBlock);
19971 assert(NodeUI &&
"Should only process reachable instructions");
19973 auto CheckOrdering = [&](
const Instruction *InsertPt) {
19986 const BasicBlock *InsertBlock = InsertPt->getParent();
19987 auto *NodeEUI = DT->getNode(InsertBlock);
19990 assert((NodeUI == NodeEUI) ==
19991 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
19992 "Different nodes should have different DFS numbers");
19994 if (TEInsertPt->
getParent() != InsertBlock &&
19995 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
19997 if (TEInsertPt->
getParent() == InsertBlock &&
20010 SmallDenseMap<Value *, int> UsedValuesEntry;
20011 SmallPtrSet<const Value *, 16> VisitedValue;
20012 bool IsReusedNodeFound =
false;
20013 auto CheckAndUseSameNode = [&](
const TreeEntry *TEPtr) {
20015 if (IsReusedNodeFound)
20017 if ((TEPtr->getVectorFactor() != VL.
size() &&
20018 TEPtr->Scalars.size() != VL.
size()) ||
20019 (!TEPtr->isSame(VL) && !TEPtr->isSame(
TE->Scalars)))
20021 IsReusedNodeFound =
20022 equal(
TE->Scalars, TEPtr->Scalars) &&
20023 equal(
TE->ReorderIndices, TEPtr->ReorderIndices) &&
20024 equal(
TE->ReuseShuffleIndices, TEPtr->ReuseShuffleIndices);
20027 for (
Value *V : VL) {
20034 auto CheckParentNodes = [&](
const TreeEntry *User1,
const TreeEntry *User2,
20035 unsigned EdgeIdx) {
20036 const TreeEntry *Ptr1 = User1;
20037 const TreeEntry *Ptr2 = User2;
20038 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
20041 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
20042 Ptr2 = Ptr2->UserTreeIndex.UserTE;
20045 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
20046 Ptr1 = Ptr1->UserTreeIndex.UserTE;
20047 if (
auto It = PtrToIdx.
find(Ptr1); It != PtrToIdx.
end())
20048 return Idx < It->second;
20054 std::optional<bool> TEInsertPtUsedOutsideBlock;
20055 auto IsTEInsertPtUsedOutsideBlock = [&] {
20056 if (!TEInsertPtUsedOutsideBlock)
20057 TEInsertPtUsedOutsideBlock =
20059 return *TEInsertPtUsedOutsideBlock;
20064 const bool TEUseEIInsertPtUsedOutside =
20065 TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
20066 !TEUseEI.UserTE->isCopyableElement(
20068 IsTEInsertPtUsedOutsideBlock();
20069 auto CheckNonSchedulableOrdering = [&](
const TreeEntry *
E,
20071 return TEUseEIInsertPtUsedOutside &&
20072 InsertPt->getNextNode() == TEInsertPt &&
20073 (!
E->hasCopyableElements() || !
E->isCopyableElement(InsertPt) ||
20079 const bool TEUserNeedsEmitFirst =
20080 TEUseEI.UserTE->State == TreeEntry::Vectorize &&
20081 TEUseEI.UserTE->hasState() &&
20082 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
20083 TEUseEI.UserTE->isAltShuffle()) &&
20087 SmallDenseMap<const TreeEntry *, bool> ScalarsUsedOutsideBlockCache;
20088 auto AllScalarsUsedOutsideBlock = [&](
const TreeEntry *UserTE) {
20096 for (
Value *V : VL) {
20100 SmallPtrSet<const TreeEntry *, 4> VToTEs;
20102 ValueToGatherNodes.lookup(V).takeVector());
20103 if (TransformedToGatherNodes.contains(TE)) {
20104 for (TreeEntry *
E : getSplitTreeEntries(V)) {
20105 if (TE ==
E || !TransformedToGatherNodes.contains(
E) ||
20106 !
E->UserTreeIndex ||
E->UserTreeIndex.UserTE->isGather())
20108 GatherNodes.push_back(
E);
20110 for (TreeEntry *
E : getTreeEntries(V)) {
20111 if (TE ==
E || !TransformedToGatherNodes.contains(
E) ||
20112 !
E->UserTreeIndex ||
E->UserTreeIndex.UserTE->isGather())
20114 GatherNodes.push_back(
E);
20117 for (
const TreeEntry *TEPtr : GatherNodes) {
20118 if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(TEPtr))
20121 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
20122 "Must contain at least single gathered value.");
20123 assert(TEPtr->UserTreeIndex &&
20124 "Expected only single user of a gather node.");
20125 if (
any_of(TEPtr->CombinedEntriesWithIndices,
20126 [&](
const auto &
P) { return P.first == TE->Idx; }))
20128 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
20130 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
20131 UseEI.UserTE->hasState())
20136 : &getLastInstructionInBundle(UseEI.UserTE);
20137 if (TEInsertPt == InsertPt) {
20139 if (TEUserNeedsEmitFirst) {
20140 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
20141 (UseEI.UserTE->hasState() &&
20142 UseEI.UserTE->getOpcode() == Instruction::PHI &&
20143 !UseEI.UserTE->isAltShuffle()) ||
20144 !AllScalarsUsedOutsideBlock(UseEI.UserTE))
20152 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
20155 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
20156 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
20157 UseEI.UserTE->State == TreeEntry::Vectorize &&
20158 UseEI.UserTE->getOpcode() == Instruction::PHI &&
20159 TEUseEI.UserTE != UseEI.UserTE)
20164 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
20168 if (TEUseEI.UserTE != UseEI.UserTE &&
20169 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
20170 HasGatherUser(TEUseEI.UserTE)))
20173 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
20177 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
20178 TEUseEI.UserTE->doesNotNeedToSchedule() !=
20179 UseEI.UserTE->doesNotNeedToSchedule() &&
20184 if ((TEInsertBlock != InsertPt->
getParent() ||
20185 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
20186 (!CheckOrdering(InsertPt) ||
20187 (UseEI.UserTE->hasCopyableElements() &&
20188 IsTEInsertPtUsedOutsideBlock() &&
20192 if (CheckAndUseSameNode(TEPtr))
20197 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
20202 const auto *It =
find_if(VTEs, [&](
const TreeEntry *MTE) {
20203 return MTE !=
TE && MTE != TEUseEI.UserTE &&
20204 !DeletedNodes.contains(MTE) &&
20205 !TransformedToGatherNodes.contains(MTE);
20207 if (It != VTEs.end()) {
20208 const TreeEntry *VTE = *It;
20209 if (
none_of(
TE->CombinedEntriesWithIndices,
20210 [&](
const auto &
P) { return P.first == VTE->Idx; })) {
20211 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
20212 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
20216 if (CheckAndUseSameNode(VTE))
20222 const auto *It =
find_if(VTEs, [&, MainTE = TE](
const TreeEntry *TE) {
20223 return TE != MainTE && !DeletedNodes.contains(TE) &&
20224 !TransformedToGatherNodes.contains(TE);
20226 if (It != VTEs.end()) {
20227 const TreeEntry *VTE = *It;
20228 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
20229 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
20230 VTEs = VTEs.drop_front();
20232 const auto *MIt =
find_if(VTEs, [](
const TreeEntry *MTE) {
20233 return MTE->State == TreeEntry::Vectorize;
20235 if (MIt == VTEs.end())
20239 if (
none_of(
TE->CombinedEntriesWithIndices,
20240 [&](
const auto &
P) { return P.first == VTE->Idx; })) {
20241 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
20242 if (&LastBundleInst == TEInsertPt ||
20243 !CheckOrdering(&LastBundleInst) ||
20244 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
20248 if (CheckAndUseSameNode(VTE))
20253 if (IsReusedNodeFound)
20255 if (VToTEs.
empty())
20257 if (UsedTEs.
empty()) {
20265 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
20267 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
20271 if (!VToTEs.
empty()) {
20277 VToTEs = SavedVToTEs;
20282 if (Idx == UsedTEs.
size()) {
20286 if (UsedTEs.
size() == 2)
20288 UsedTEs.push_back(SavedVToTEs);
20289 Idx = UsedTEs.
size() - 1;
20295 if (UsedTEs.
empty()) {
20297 return std::nullopt;
20301 if (UsedTEs.
size() == 1) {
20304 UsedTEs.front().
end());
20305 sort(FirstEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
20306 return TE1->Idx < TE2->Idx;
20309 auto *It =
find_if(FirstEntries, [=](
const TreeEntry *EntryPtr) {
20310 return EntryPtr->isSame(VL) || EntryPtr->isSame(
TE->Scalars);
20312 if (It != FirstEntries.end() &&
20313 (IsReusedNodeFound || (*It)->getVectorFactor() == VL.size() ||
20314 ((*It)->getVectorFactor() ==
TE->Scalars.size() &&
20315 TE->ReuseShuffleIndices.size() == VL.size() &&
20316 (*It)->isSame(
TE->Scalars)))) {
20318 if (IsReusedNodeFound || (*It)->getVectorFactor() == VL.size()) {
20319 std::iota(std::next(
Mask.begin(), MaskBase),
20320 std::next(
Mask.begin(), MaskBase + VL.size()), 0);
20322 SmallVector<int> CommonMask =
TE->getCommonMask();
20333 Entries.
push_back(FirstEntries.front());
20335 for (
auto &
P : UsedValuesEntry)
20337 VF = FirstEntries.front()->getVectorFactor();
20340 assert(UsedTEs.
size() == 2 &&
"Expected at max 2 permuted entries.");
20342 DenseMap<int, const TreeEntry *> VFToTE;
20343 for (
const TreeEntry *TE : UsedTEs.front()) {
20344 unsigned VF =
TE->getVectorFactor();
20345 auto It = VFToTE.
find(VF);
20346 if (It != VFToTE.
end()) {
20347 if (It->second->Idx >
TE->Idx)
20348 It->getSecond() =
TE;
20355 UsedTEs.back().
end());
20356 sort(SecondEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
20357 return TE1->Idx < TE2->Idx;
20359 for (
const TreeEntry *TE : SecondEntries) {
20360 auto It = VFToTE.
find(
TE->getVectorFactor());
20361 if (It != VFToTE.
end()) {
20370 if (Entries.
empty()) {
20372 UsedTEs.front(), [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
20373 return TE1->Idx < TE2->Idx;
20375 Entries.
push_back(SecondEntries.front());
20376 VF = std::max(Entries.
front()->getVectorFactor(),
20377 Entries.
back()->getVectorFactor());
20379 VF = Entries.
front()->getVectorFactor();
20382 for (
const TreeEntry *
E : Entries)
20386 for (
auto &
P : UsedValuesEntry) {
20388 if (ValuesToEntries[Idx].
contains(
P.first)) {
20398 auto AreCompatiblePHIs = [&](
Value *
V,
Value *V1) {
20405 for (
int I = 0,
E =
PHI->getNumIncomingValues();
I <
E; ++
I) {
20407 Value *In1 = PHI1->getIncomingValue(
I);
20425 SmallDenseMap<Value *, bool> MightBeIgnoredCache;
20426 auto MightBeIgnored = [=, &MightBeIgnoredCache](
Value *
V) {
20433 !areAllUsersVectorized(
I, UserIgnoreList) &&
isSimple(
I);
20440 auto NeighborMightBeIgnored = [&](
Value *
V,
int Idx) {
20441 Value *V1 = VL[Idx];
20442 bool UsedInSameVTE =
false;
20443 auto It = UsedValuesEntry.find(V1);
20444 if (It != UsedValuesEntry.end())
20445 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
20446 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
20453 SmallBitVector UsedIdxs(Entries.size());
20455 for (
int I = 0,
E = VL.size();
I <
E; ++
I) {
20457 auto It = UsedValuesEntry.find(V);
20458 if (It == UsedValuesEntry.end())
20464 ((
I > 0 && NeighborMightBeIgnored(V,
I - 1)) ||
20465 (
I !=
E - 1 && NeighborMightBeIgnored(V,
I + 1)))))
20467 unsigned Idx = It->second;
20474 for (
unsigned I = 0, Sz = Entries.size();
I < Sz; ++
I) {
20475 if (!UsedIdxs.test(
I))
20481 for (std::pair<unsigned, int> &Pair : EntryLanes)
20482 if (Pair.first ==
I)
20483 Pair.first = TempEntries.
size();
20486 Entries.swap(TempEntries);
20487 if (EntryLanes.size() == Entries.size() &&
20489 .slice(MaskBase,
getNumElems(
TE->Scalars.size(), SliceSize,
20496 return std::nullopt;
20499 bool IsIdentity = Entries.size() == 1;
20502 for (
const std::pair<unsigned, int> &Pair : EntryLanes) {
20503 unsigned Idx = MaskBase + Pair.second;
20506 (ForOrder ? std::distance(
20507 Entries[Pair.first]->Scalars.begin(),
20508 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
20509 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
20510 IsIdentity &=
Mask[Idx] == Pair.second;
20512 if (ForOrder || IsIdentity || Entries.empty()) {
20513 switch (Entries.size()) {
20515 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
20519 if (EntryLanes.size() > 2 || VL.size() <= 2)
20526 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
20528 SmallVector<int> SubMask(std::next(
Mask.begin(), MaskBase),
20529 std::next(
Mask.begin(), MaskBase + VL.size()));
20530 int MinElement = SubMask.
front(), MaxElement = SubMask.
front();
20531 for (
int Idx : SubMask) {
20539 assert(MaxElement >= 0 && MinElement >= 0 &&
20540 MaxElement % VF >= MinElement % VF &&
20541 "Expected at least single element.");
20546 unsigned MinIdx = MinElement % VF;
20549 *TTI, VL.front()->getType(), MinIdx);
20550 auto *RegFloorTy =
getWidenedType(VL.front()->getType(), RegFloor);
20551 unsigned RegFloorParts =
20553 if (RegFloorParts > 1)
20557 std::max<unsigned>(VL.size(), (MaxElement % VF) -
Offset + 1);
20559 for (
int &Idx : SubMask) {
20562 Idx = (Idx % VF) -
Offset + (Idx >=
static_cast<int>(VF) ? NewVF : 0);
20570 auto *MaskVecTy =
getWidenedType(VL.front()->getType(), SubMask.size());
20571 auto GetShuffleCost = [&,
20572 &TTI = *TTI](ArrayRef<int>
Mask,
20575 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
20577 Mask, Entries.front()->getInterleaveFactor()))
20579 return ::getShuffleCost(TTI,
20584 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
20586 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
20587 if (Entries.size() == 1 || !Entries[0]->isGather()) {
20588 FirstShuffleCost = ShuffleCost;
20592 bool IsIdentity =
true;
20593 for (
auto [
I, Idx] :
enumerate(FirstMask)) {
20594 if (Idx >=
static_cast<int>(NewVF)) {
20599 IsIdentity &=
static_cast<int>(
I) == Idx;
20603 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
20605 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
20609 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
20610 if (Entries.size() == 1 || !Entries[1]->isGather()) {
20611 SecondShuffleCost = ShuffleCost;
20615 bool IsIdentity =
true;
20616 for (
auto [
I, Idx] :
enumerate(SecondMask)) {
20617 if (Idx <
static_cast<int>(NewVF) && Idx >= 0) {
20623 IsIdentity &=
static_cast<int>(
I) == Idx;
20628 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
20630 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
20638 *TTI, VL.front()->getType(), MaskVecTy, DemandedElts,
true,
20640 const TreeEntry *BestEntry =
nullptr;
20642 if (FirstShuffleCost < ShuffleCost) {
20643 for (
int &Idx : MaskSlice)
20644 if (Idx >=
static_cast<int>(VF))
20646 BestEntry = Entries.front();
20647 ShuffleCost = FirstShuffleCost;
20649 if (SecondShuffleCost < ShuffleCost) {
20650 for (
int &Idx : MaskSlice) {
20651 if (Idx <
static_cast<int>(VF))
20656 BestEntry = Entries[1];
20657 ShuffleCost = SecondShuffleCost;
20659 if (BuildVectorCost >= ShuffleCost) {
20662 Entries.push_back(BestEntry);
20670 std::fill(std::next(
Mask.begin(), MaskBase),
20672 return std::nullopt;
20676BoUpSLP::isGatherShuffledEntry(
20680 assert(NumParts > 0 && NumParts < VL.
size() &&
20681 "Expected positive number of registers.");
20684 if (TE == VectorizableTree.front().get() &&
20685 (!GatheredLoadsEntriesFirst.has_value() ||
20687 [](
const std::unique_ptr<TreeEntry> &TE) {
20688 return !
TE->isGather();
20692 assert((
TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
20693 "Expected only single user of the gather node.");
20696 if (
TE->UserTreeIndex &&
TE->UserTreeIndex.UserTE->isGather() &&
20697 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
20699 (
TE->hasState() &&
TE->getOpcode() == Instruction::ExtractElement) ||
20702 getSameValuesTreeEntry(
TE->getMainOp(),
TE->Scalars))))
20707 if (Part * SliceSize >= VL.
size())
20711 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
20712 std::optional<TTI::ShuffleKind> SubRes =
20713 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
20714 ForOrder, SliceSize);
20716 SubEntries.
clear();
20719 SubEntries.
front()->getVectorFactor() == VL.
size() &&
20720 (SubEntries.
front()->isSame(
TE->Scalars) ||
20721 SubEntries.
front()->isSame(VL))) {
20723 LocalSubEntries.
swap(SubEntries);
20726 std::iota(
Mask.begin(),
Mask.end(), 0);
20728 for (
int I = 0, Sz = VL.
size();
I < Sz; ++
I)
20731 Entries.emplace_back(1, LocalSubEntries.
front());
20737 [](
const std::optional<TTI::ShuffleKind> &SK) {
return !SK; })) {
20745 Type *ScalarTy)
const {
20746 const unsigned VF = VL.
size();
20754 auto EstimateInsertCost = [&](
unsigned I,
Value *
V) {
20756 if (
V->getType() != ScalarTy)
20757 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy,
V->getType(),
20761 std::iota(ConstantShuffleMask.begin(), ConstantShuffleMask.end(), 0);
20768 ConstantShuffleMask[
I] =
I + VF;
20771 EstimateInsertCost(
I, V);
20774 bool IsAnyNonUndefConst =
20777 if (!ForPoisonSrc && IsAnyNonUndefConst) {
20779 ConstantShuffleMask);
20783 if (!DemandedElements.
isZero())
20787 ForPoisonSrc && !IsAnyNonUndefConst, VL);
// Computes the instruction recorded as the "last instruction" of the bundle
// for tree entry E. Every result path visible below stores the result in
// EntryToLastInstruction with try_emplace.
// NOTE(review): this listing shows only part of the function (the embedded
// line numbers are not contiguous); the comments describe the visible
// statements only.
20791Instruction &BoUpSLP::getLastInstructionInBundle(
const TreeEntry *
E) {
// Look E up in the EntryToLastInstruction cache first. The statement taken on
// a hit is not shown here; presumably the cached instruction is returned.
20792 auto It = EntryToLastInstruction.find(
E);
20793 if (It != EntryToLastInstruction.end())
// For entries that have state, Front and Opcode are taken from the entry's
// main operation and opcode.
20801 if (
E->hasState()) {
20802 Front =
E->getMainOp();
20803 Opcode =
E->getOpcode();
// Sanity condition (its message is the string literal below): gathered loads
// placed before GatheredLoadsEntriesFirst, split-vectorize nodes, nodes with
// copyable elements, or scalars accepted by the per-value predicate.
20810 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
20811 E->isGather() &&
E->Idx < *GatheredLoadsEntriesFirst) ||
20812 E->State == TreeEntry::SplitVectorize ||
E->hasCopyableElements() ||
20814 [=](
Value *V) ->
bool {
20815 if (Opcode == Instruction::GetElementPtr &&
20816 !isa<GetElementPtrInst>(V))
20818 auto *I = dyn_cast<Instruction>(V);
// Accepted: non-instructions, instructions that do not match the main/alt op,
// instructions located in BB, and vector-like instructions with constant
// operands.
20819 return !I || !E->getMatchingMainOpOrAltOp(I) ||
20820 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
20822 "Expected gathered loads or GEPs or instructions from same basic "
// Helper: walks E->Scalars while tracking LastInst. Copyable elements are
// tested with E->isCopyableElement (the action taken is not shown). When the
// candidate lives in a different block than LastInst, reachability from the
// entry block is checked and the DFS-in numbers of the two blocks'
// dominator-tree nodes are compared.
20825 auto FindLastInst = [&]() {
20827 for (
Value *V :
E->Scalars) {
20831 if (
E->isCopyableElement(
I))
20833 if (LastInst->
getParent() ==
I->getParent()) {
20838 assert(((Opcode == Instruction::GetElementPtr &&
20840 E->State == TreeEntry::SplitVectorize ||
20843 (GatheredLoadsEntriesFirst.has_value() &&
20844 Opcode == Instruction::Load &&
E->isGather() &&
20845 E->Idx < *GatheredLoadsEntriesFirst)) &&
20846 "Expected vector-like or non-GEP in GEP node insts only.");
20847 if (!DT->isReachableFromEntry(LastInst->
getParent())) {
20851 if (!DT->isReachableFromEntry(
I->getParent()))
20853 auto *NodeA = DT->getNode(LastInst->
getParent());
20854 auto *NodeB = DT->getNode(
I->getParent());
20855 assert(NodeA &&
"Should only process reachable instructions");
20856 assert(NodeB &&
"Should only process reachable instructions");
20857 assert((NodeA == NodeB) ==
20858 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
20859 "Different nodes should have different DFS numbers");
// NodeA belongs to the current LastInst, NodeB to the candidate I.
20860 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
// Helper: counterpart of FindLastInst that tracks FirstInst. Within one block
// it uses I->comesBefore(FirstInst); across blocks it compares the DFS-in
// numbers with the opposite relation (>).
20867 auto FindFirstInst = [&]() {
20869 for (
Value *V :
E->Scalars) {
20873 if (
E->isCopyableElement(
I))
20875 if (FirstInst->
getParent() ==
I->getParent()) {
20876 if (
I->comesBefore(FirstInst))
20880 assert(((Opcode == Instruction::GetElementPtr &&
20884 "Expected vector-like or non-GEP in GEP node insts only.");
20885 if (!DT->isReachableFromEntry(FirstInst->
getParent())) {
20889 if (!DT->isReachableFromEntry(
I->getParent()))
20891 auto *NodeA = DT->getNode(FirstInst->
getParent());
20892 auto *NodeB = DT->getNode(
I->getParent());
20893 assert(NodeA &&
"Should only process reachable instructions");
20894 assert(NodeB &&
"Should only process reachable instructions");
20895 assert((NodeA == NodeB) ==
20896 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
20897 "Different nodes should have different DFS numbers");
20898 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
// Split-vectorize nodes: start from FindLastInst() and also query the last
// instruction of each entry in Entries through a recursive call. How the two
// are combined is not shown in this listing.
20904 if (
E->State == TreeEntry::SplitVectorize) {
20905 Res = FindLastInst();
20907 for (
auto *
E : Entries) {
20910 I = &getLastInstructionInBundle(
E);
20915 EntryToLastInstruction.try_emplace(
E, Res);
// Non-gather load nodes with Idx at or after GatheredLoadsEntriesFirst use the
// first instruction of the bundle instead of the last one.
20920 if (GatheredLoadsEntriesFirst.has_value() &&
20921 E->Idx >= *GatheredLoadsEntriesFirst && !
E->isGather() &&
20922 Opcode == Instruction::Load) {
20923 Res = FindFirstInst();
20924 EntryToLastInstruction.try_emplace(
E, Res);
// Helper: consults BlocksSchedules for BB and searches the bundles of the
// entry's scalars for one whose getTreeEntry() is E.
20930 auto FindScheduleBundle = [&](
const TreeEntry *
E) ->
const ScheduleBundle * {
20934 const auto *It = BlocksSchedules.find(BB);
20935 if (It == BlocksSchedules.end())
20937 for (
Value *V :
E->Scalars) {
20943 if (Bundles.
empty())
20946 Bundles, [&](ScheduleBundle *
B) {
return B->getTreeEntry() ==
E; });
20947 if (It != Bundles.
end())
20952 const ScheduleBundle *Bundle = FindScheduleBundle(
E);
// Vectorized (non-gather) node without a schedule bundle: the predicates below
// select between FindLastInst() and FindFirstInst(). The exact structure of
// the selecting condition is only partially visible.
20953 if (!
E->isGather() && !Bundle) {
20954 if ((Opcode == Instruction::GetElementPtr &&
20957 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
20961 return isa<PoisonValue>(V) ||
20962 (E->Idx == 0 && isa<InsertElementInst>(V)) ||
20963 E->isCopyableElement(V) ||
20964 (!isVectorLikeInstWithConstOps(V) &&
20965 isUsedOutsideBlock(V));
20967 (!
E->doesNotNeedToSchedule() ||
20970 if (!isa<Instruction>(V) ||
20971 (E->hasCopyableElements() && E->isCopyableElement(V)))
20973 return !areAllOperandsNonInsts(V);
20976 if (!isa<Instruction>(V) ||
20977 (E->hasCopyableElements() && E->isCopyableElement(V)))
20979 return MustGather.contains(V);
20981 Res = FindLastInst();
20983 Res = FindFirstInst();
20984 EntryToLastInstruction.try_emplace(
E, Res);
// Scheduled bundle path: the result is the instruction of the last element
// returned by Bundle->getBundle().
20993 assert(!
E->isGather() &&
"Gathered instructions should not be scheduled");
20994 Res = Bundle->getBundle().back()->getInst();
20995 EntryToLastInstruction.try_emplace(
E, Res);
// Remaining path: use FindLastInst() and require a non-null result.
21018 Res = FindLastInst();
21019 assert(Res &&
"Failed to find last instruction in bundle");
21020 EntryToLastInstruction.try_emplace(
E, Res);
// Positions Builder for emitting code for tree entry E, based on the
// instruction returned by getLastInstructionInBundle(E), and finally sets the
// builder's debug location from E's main operation.
// NOTE(review): only part of the function's lines appear in this listing; the
// comments describe the visible statements only.
21024void BoUpSLP::setInsertPointAfterBundle(
const TreeEntry *
E) {
21025 auto *Front =
E->getMainOp();
21026 Instruction *LastInst = &getLastInstructionInBundle(
E);
21027 assert(LastInst &&
"Failed to find last instruction in bundle");
// Take the first non-PHI position of LastInst's block; if that position
// belongs to a landing-pad block, step one position further. The condition
// guarding this computation (if any) is not shown.
21032 LastInstIt = LastInst->
getParent()->getFirstNonPHIIt();
21033 if (LastInstIt != LastInst->
getParent()->end() &&
21034 LastInstIt->getParent()->isLandingPad())
21035 LastInstIt = std::next(LastInstIt);
// Cases in which the builder is placed at LastInstIt inside LastInst's block:
// non-gather, non-split nodes that do not need scheduling or have copyable
// elements (with further conditions not shown), and non-gather load nodes
// with Idx at or after GatheredLoadsEntriesFirst.
21038 (!
E->isGather() &&
E->State != TreeEntry::SplitVectorize &&
21039 (
E->doesNotNeedToSchedule() ||
21040 (
E->hasCopyableElements() && !
E->isCopyableElement(LastInst) &&
21042 (GatheredLoadsEntriesFirst.has_value() &&
21043 E->Idx >= *GatheredLoadsEntriesFirst && !
E->isGather() &&
21044 E->getOpcode() == Instruction::Load)) {
21045 Builder.SetInsertPoint(LastInst->
getParent(), LastInstIt);
// Otherwise a different insertion point is used (its arguments are not shown).
21049 Builder.SetInsertPoint(
// LastInstructionToPos maps LastInst to a previously created position
// instruction. When there is none, an aligned load of pointer type is created
// and, except for split-vectorize nodes, remembered in the map.
// NOTE(review): the load looks like a position marker only - confirm against
// the lines not shown here.
21052 if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
21055 Res = Builder.CreateAlignedLoad(Builder.getPtrTy(),
21060 if (
E->State != TreeEntry::SplitVectorize)
21061 LastInstructionToPos.try_emplace(LastInst, Res);
21064 Builder.SetCurrentDebugLocation(Front->
getDebugLoc());
// Builds a vector from the scalars in VL with insertelement instructions,
// optionally combining the result with Root through the CreateShuffle
// callback.
// NOTE(review): part of the parameter list and many body lines are not shown
// in this listing; the comments describe the visible statements only.
21067Value *BoUpSLP::gather(
21069 function_ref<
Value *(
Value *,
Value *, ArrayRef<int>)> CreateShuffle) {
21075 SmallSet<int, 4> PostponedIndices;
21076 Loop *
L = LI->getLoopFor(Builder.GetInsertBlock());
// Body of a local predicate (called below as CheckPredecessor; its header line
// is not shown): follows the single-predecessor chain from InsertBB until
// InstBB is reached, the chain ends, or a block repeats (Visited).
21078 SmallPtrSet<BasicBlock *, 4> Visited;
21079 while (InsertBB && InsertBB != InstBB && Visited.
insert(InsertBB).second)
21080 InsertBB = InsertBB->getSinglePredecessor();
21081 return InsertBB && InsertBB == InstBB;
// First pass over VL: indices whose instruction satisfies CheckPredecessor or
// the loop-related condition on L are added to PostponedIndices.
21083 for (
int I = 0,
E = VL.
size();
I <
E; ++
I) {
21085 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
21087 (L && (!Root ||
L->isLoopInvariant(Root)) &&
L->contains(Inst))) &&
21088 PostponedIndices.
insert(
I).second)
// Helper: inserts one scalar into Vec at lane Pos. The scalar is converted
// with CreateIntCast when its type differs from Ty, the created insertelement
// is recorded in GatherShuffleExtractSeq, and, when a live tree entry (neither
// transformed to gather nor deleted) is found in Entries, an external use is
// registered.
21092 auto &&CreateInsertElement = [
this](
Value *Vec,
Value *
V,
unsigned Pos,
21099 if (
Scalar->getType() != Ty) {
21110 Scalar = Builder.CreateIntCast(
21124 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
21129 GatherShuffleExtractSeq.insert(InsElt);
21134 const auto *It =
find_if(Entries, [&](
const TreeEntry *
E) {
21135 return !TransformedToGatherNodes.contains(
E) &&
21136 !DeletedNodes.contains(
E);
21138 if (It != Entries.
end()) {
21140 User *UserOp =
nullptr;
// For vector-typed scalars, the shufflevector that actually uses V is
// searched through the operands of SV.
21145 if (
V->getType()->isVectorTy()) {
21147 SV && SV->getOperand(0) != V && SV->getOperand(1) != V) {
21149 auto FindOperand = [](
Value *Vec,
Value *
V) -> Instruction * {
21151 if (SV->getOperand(0) == V)
21153 if (SV->getOperand(1) == V)
21159 if (Instruction *User = FindOperand(SV->getOperand(0), V))
21161 else if (Instruction *User = FindOperand(SV->getOperand(1), V))
21164 "Failed to find shufflevector, caused by resize.");
// Two registration forms: with a null user (V is then also added to
// ExternalUsesWithNonUsers) and with UserOp as the user.
21171 unsigned FoundLane = (*It)->findLaneForValue(V);
21172 ExternalUses.emplace_back(V,
nullptr, **It, FoundLane);
21173 ExternalUsesWithNonUsers.insert(V);
21179 unsigned FoundLane = (*It)->findLaneForValue(V);
21180 ExternalUses.emplace_back(V, UserOp, **It, FoundLane);
// Main construction: Mask starts as the identity. If the root is a
// shufflevector (SV) whose first operand has type VecTy, Root is replaced by
// that operand and Mask by the shuffle's own mask; OriginalRoot keeps the
// value passed in.
21188 SmallVector<int> NonConsts;
21190 std::iota(
Mask.begin(),
Mask.end(), 0);
21191 Value *OriginalRoot = Root;
21194 SV->getOperand(0)->getType() == VecTy) {
21195 Root = SV->getOperand(0);
21196 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
21199 for (
int I = 0,
E = VL.
size();
I <
E; ++
I) {
21208 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
21213 Vec = OriginalRoot;
21215 Vec = CreateShuffle(Root, Vec, Mask);
21217 OI && OI->use_empty() &&
21218 none_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
21219 return TE->VectorizedValue == OI;
// Scalars whose indices were collected in NonConsts are inserted next,
// followed by the postponed (value, lane) pairs in PostponedInsts.
21225 for (
int I : NonConsts)
21226 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
21229 for (
const std::pair<Value *, unsigned> &Pair : PostponedInsts)
21230 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
21268 bool IsFinalized =
false;
// Helper class that wraps an IR builder for emitting shufflevector
// instructions. Per the visible statements, created instructions are inserted
// into GatherShuffleExtractSeq and their parent blocks into CSEBlocks.
// NOTE(review): member declarations and several method headers are not shown
// in this listing; the comments describe the visible statements only.
21281 class ShuffleIRBuilder {
// Constructor initializer list: stores the builder, the two bookkeeping
// containers and the data layout.
21294 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
21295 CSEBlocks(CSEBlocks),
DL(DL) {}
21296 ~ShuffleIRBuilder() =
default;
// Two-source shuffle (header not shown): when the integer bit widths differ,
// one of the operands is converted with CreateIntCast before the shuffle is
// created.
21302 "Expected integer vector types only.");
21308 ->getIntegerBitWidth())
21309 V2 = Builder.CreateIntCast(
21312 V1 = Builder.CreateIntCast(
21316 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
21318 GatherShuffleExtractSeq.insert(
I);
21319 CSEBlocks.insert(
I->getParent());
// Single-source shuffle (header not shown).
21328 unsigned VF = Mask.size();
21332 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
21334 GatherShuffleExtractSeq.insert(
I);
21335 CSEBlocks.insert(
I->getParent());
// Identity: returns the input value unchanged.
21339 Value *createIdentity(
Value *V) {
return V; }
// Takes an element type and a vector factor; the body is not shown.
21340 Value *createPoison(
Type *Ty,
unsigned VF) {
// Makes V1 and V2 agree in size: the operand whose size equals MinVF is
// replaced, in place, by a shuffle of itself with IdentityMask, whose first
// MinVF elements are filled by std::iota.
21345 void resizeToMatch(
Value *&V1,
Value *&V2) {
21350 int VF = std::max(V1VF, V2VF);
21351 int MinVF = std::min(V1VF, V2VF);
21353 std::iota(IdentityMask.
begin(), std::next(IdentityMask.
begin(), MinVF),
21355 Value *&
Op = MinVF == V1VF ? V1 : V2;
21356 Op = Builder.CreateShuffleVector(
Op, IdentityMask);
21358 GatherShuffleExtractSeq.insert(
I);
21359 CSEBlocks.insert(
I->getParent());
21372 assert(V1 &&
"Expected at least one vector value.");
21373 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
21374 R.CSEBlocks, *R.DL);
21375 return BaseShuffleAnalysis::createShuffle<Value *>(
21376 V1, V2, Mask, ShuffleBuilder, ScalarTy);
21382 std::optional<bool> IsSigned = std::nullopt) {
21385 if (VecTy->getElementType() == ScalarTy->getScalarType())
21387 return Builder.CreateIntCast(
21388 V,
VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
// Returns the vectorized value of tree entry E passed through
// castToScalarTyElem. The second argument is true when at least one scalar of
// E is neither poison nor known to be non-negative (isKnownNonNegative with a
// SimplifyQuery built from R.DL).
// NOTE(review): some lines between the first statement and the return are not
// shown in this listing; there may be an additional early exit.
21392 Value *getVectorizedValue(
const TreeEntry &E) {
21393 Value *Vec = E.VectorizedValue;
21396 return castToScalarTyElem(Vec,
any_of(E.Scalars, [&](
Value *V) {
21397 return !isa<PoisonValue>(V) &&
21398 !isKnownNonNegative(
21399 V, SimplifyQuery(*R.DL));
21405 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
21409 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
21410 unsigned NumParts,
bool &UseVecBaseAsInput) {
21411 UseVecBaseAsInput =
false;
21413 Value *VecBase =
nullptr;
21415 if (!E->ReorderIndices.empty()) {
21417 E->ReorderIndices.end());
21420 for (
int I = 0, Sz = Mask.size();
I < Sz; ++
I) {
21425 VecBase = EI->getVectorOperand();
21427 VecBase = TEs.front()->VectorizedValue;
21428 assert(VecBase &&
"Expected vectorized value.");
21429 UniqueBases.
insert(VecBase);
21432 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
21433 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
21434 !R.isVectorized(EI) &&
21436 count_if(E->UserTreeIndex.UserTE->Scalars,
21437 [&](
Value *V) { return V == EI; })) ||
21438 (NumParts != 1 &&
count(VL, EI) > 1) ||
21440 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(U);
21441 return UTEs.empty() || UTEs.size() > 1 ||
21443 [&](const TreeEntry *TE) {
21444 return R.DeletedNodes.contains(TE) ||
21445 R.TransformedToGatherNodes.contains(TE);
21451 [&](
const std::unique_ptr<TreeEntry> &TE) {
21452 return TE->UserTreeIndex.UserTE ==
21454 is_contained(VL, EI);
21458 R.eraseInstruction(EI);
21460 if (NumParts == 1 || UniqueBases.
size() == 1) {
21461 assert(VecBase &&
"Expected vectorized value.");
21462 return castToScalarTyElem(VecBase);
21464 UseVecBaseAsInput =
true;
21474 Value *Vec =
nullptr;
21481 constexpr int MaxBases = 2;
21483 auto VLMask =
zip(SubVL, SubMask);
21484 const unsigned VF =
21485 accumulate(VLMask, 0U, [&](
unsigned S,
const auto &
D) {
21492 VecOp = TEs.front()->VectorizedValue;
21493 assert(VecOp &&
"Expected vectorized value.");
21494 const unsigned Size =
21496 return std::max(S,
Size);
21498 for (
const auto [V,
I] : VLMask) {
21503 VecOp = TEs.front()->VectorizedValue;
21504 assert(VecOp &&
"Expected vectorized value.");
21505 VecOp = castToScalarTyElem(VecOp);
21506 Bases[
I / VF] = VecOp;
21508 if (!Bases.front())
21511 if (Bases.back()) {
21512 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
21513 TransformToIdentity(SubMask);
21515 SubVec = Bases.front();
21521 ArrayRef<int> SubMask =
21522 Mask.slice(
P * SliceSize,
21525 return all_of(SubMask, [](
int Idx) {
21529 "Expected first part or all previous parts masked.");
21530 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
21535 unsigned SubVecVF =
21537 NewVF = std::max(NewVF, SubVecVF);
21540 for (
int &Idx : SubMask)
21543 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
21544 Vec = createShuffle(Vec, SubVec, VecMask);
21545 TransformToIdentity(VecMask);
21553 std::optional<Value *>
21559 TEs, [](
const TreeEntry *TE) {
return TE->VectorizedValue; });
21561 return std::nullopt;
21564 auto *ResVecTy =
getWidenedType(ScalarTy, E->getVectorFactor());
21565 return Builder.CreateAlignedLoad(
21572 IsFinalized =
false;
21573 CommonMask.clear();
21579 Value *V1 = getVectorizedValue(E1);
21580 Value *V2 = getVectorizedValue(E2);
21586 Value *V1 = getVectorizedValue(E1);
21591 assert(V1 && V2 && !Mask.empty() &&
"Expected non-empty input vectors.");
21594 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
21595 V1 = castToScalarTyElem(V1);
21596 V2 = castToScalarTyElem(V2);
21597 if (InVectors.empty()) {
21598 InVectors.push_back(V1);
21599 InVectors.push_back(V2);
21600 CommonMask.assign(Mask.begin(), Mask.end());
21603 Value *Vec = InVectors.front();
21604 if (InVectors.size() == 2) {
21605 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
21606 transformMaskAfterShuffle(CommonMask, CommonMask);
21609 Vec = createShuffle(Vec,
nullptr, CommonMask);
21610 transformMaskAfterShuffle(CommonMask, CommonMask);
21612 V1 = createShuffle(V1, V2, Mask);
21613 unsigned VF = std::max(getVF(V1), getVF(Vec));
21614 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
21616 CommonMask[Idx] = Idx + VF;
21617 InVectors.front() = Vec;
21618 if (InVectors.size() == 2)
21619 InVectors.back() = V1;
21621 InVectors.push_back(V1);
21626 "castToScalarTyElem expects V1 to be FixedVectorType");
21627 V1 = castToScalarTyElem(V1);
21628 if (InVectors.empty()) {
21629 InVectors.push_back(V1);
21630 CommonMask.assign(Mask.begin(), Mask.end());
21633 const auto *It =
find(InVectors, V1);
21634 if (It == InVectors.end()) {
21635 if (InVectors.size() == 2 ||
21636 InVectors.front()->getType() != V1->
getType()) {
21637 Value *V = InVectors.front();
21638 if (InVectors.size() == 2) {
21639 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
21640 transformMaskAfterShuffle(CommonMask, CommonMask);
21642 CommonMask.size()) {
21643 V = createShuffle(InVectors.front(),
nullptr, CommonMask);
21644 transformMaskAfterShuffle(CommonMask, CommonMask);
21646 unsigned VF = std::max(CommonMask.size(), Mask.size());
21647 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
21649 CommonMask[Idx] = V->getType() != V1->
getType()
21651 : Mask[Idx] + getVF(V1);
21652 if (V->getType() != V1->
getType())
21653 V1 = createShuffle(V1,
nullptr, Mask);
21654 InVectors.front() = V;
21655 if (InVectors.size() == 2)
21656 InVectors.back() = V1;
21658 InVectors.push_back(V1);
21663 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
21665 InVectors.push_back(V1);
21670 for (
Value *V : InVectors)
21671 VF = std::max(VF, getVF(V));
21672 for (
unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
21674 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
21683 Value *Root =
nullptr) {
21684 return R.gather(VL, Root, ScalarTy,
21686 return createShuffle(V1, V2, Mask);
21695 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
21700 IsFinalized =
true;
21703 if (InVectors.
size() == 2) {
21704 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
21707 Vec = createShuffle(Vec,
nullptr, CommonMask);
21709 transformMaskAfterShuffle(CommonMask, CommonMask);
21711 "Expected vector length for the final value before action.");
21715 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
21716 Vec = createShuffle(Vec,
nullptr, ResizeMask);
21718 Action(Vec, CommonMask, [
this](
Value *V1,
Value *V2, ArrayRef<int> Mask) {
21719 return createShuffle(V1, V2, Mask);
21721 InVectors.
front() = Vec;
21723 if (!SubVectors.empty()) {
21725 if (InVectors.
size() == 2) {
21726 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
21729 Vec = createShuffle(Vec,
nullptr, CommonMask);
21731 transformMaskAfterShuffle(CommonMask, CommonMask);
21732 auto CreateSubVectors = [&](
Value *Vec,
21733 SmallVectorImpl<int> &CommonMask) {
21734 for (
auto [
E, Idx] : SubVectors) {
21735 Value *
V = getVectorizedValue(*
E);
21742 Type *OrigScalarTy = ScalarTy;
21745 Builder, Vec, V, InsertionIndex,
21746 std::bind(&ShuffleInstructionBuilder::createShuffle,
this, _1, _2,
21748 ScalarTy = OrigScalarTy;
21749 if (!CommonMask.
empty()) {
21750 std::iota(std::next(CommonMask.
begin(), Idx),
21751 std::next(CommonMask.
begin(), Idx +
E->getVectorFactor()),
21757 if (SubVectorsMask.
empty()) {
21758 Vec = CreateSubVectors(Vec, CommonMask);
21761 copy(SubVectorsMask, SVMask.begin());
21762 for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
21765 I1 = I2 + CommonMask.
size();
21770 Vec = createShuffle(InsertVec, Vec, SVMask);
21771 transformMaskAfterShuffle(CommonMask, SVMask);
21773 InVectors.
front() = Vec;
21776 if (!ExtMask.
empty()) {
21777 if (CommonMask.
empty()) {
21781 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
21784 NewMask[
I] = CommonMask[ExtMask[
I]];
21786 CommonMask.
swap(NewMask);
21789 if (CommonMask.
empty()) {
21790 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
21791 return InVectors.
front();
21793 if (InVectors.
size() == 2)
21794 return createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
21795 return createShuffle(InVectors.
front(),
nullptr, CommonMask);
21799 assert((IsFinalized || CommonMask.empty()) &&
21800 "Shuffle construction must be finalized.");
21804Value *BoUpSLP::vectorizeOperand(TreeEntry *E,
unsigned NodeIdx) {
// Produces the result for gather node E through a shuffle builder of type
// BVTy; ResTy is the type of the produced result.
// NOTE(review): this listing shows only part of the function (the embedded
// line numbers are not contiguous, and the end of the parameter list is
// missing); the comments describe the visible statements only.
21808template <
typename BVTy,
typename ResTy,
typename... Args>
21809ResTy BoUpSLP::processBuildVector(
const TreeEntry *E,
Type *ScalarTy,
// Only gather nodes, or nodes recorded in TransformedToGatherNodes, are
// accepted.
21811 assert((E->isGather() || TransformedToGatherNodes.contains(E)) &&
21812 "Expected gather node.");
21813 unsigned VF = E->getVectorFactor();
21815 bool NeedFreeze =
false;
21819 E->CombinedEntriesWithIndices.size());
// A split-vectorize entry that was transformed to a gather node drops the
// sub-vector list.
21820 if (E->State == TreeEntry::SplitVectorize &&
21821 TransformedToGatherNodes.contains(E)) {
21822 SubVectors.
clear();
21825 for (
auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
21827 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
// SubVectors is filled with (tree entry, index) pairs taken from
// E->CombinedEntriesWithIndices.
21830 E->CombinedEntriesWithIndices, SubVectors.
begin(), [&](
const auto &
P) {
21831 return std::make_pair(VectorizableTree[P.first].get(), P.second);
21837 E->ReorderIndices.end());
21838 if (!ReorderMask.
empty())
21844 if (!SubVectors.
empty() && !SubVectorsMask.
empty()) {
21846 if (E->Scalars[
I] == GatheredScalars[ReorderMask[
I]])
21849 SubVectorsMask.
clear();
// Parameter tail and body of a local lambda whose header line is not shown.
// NOTE(review): presumably this is FindReusedSplat, which is called further
// down with a matching argument list - confirm.
21853 unsigned I,
unsigned SliceSize,
21854 bool IsNotPoisonous) {
21856 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
21859 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
21860 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
21861 if (UserTE->getNumOperands() != 2)
21863 if (!IsNotPoisonous) {
// Search the tree entries after UserTE for one with the same user entry but a
// different edge index than E.
21864 auto *It =
find_if(
ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
21865 [=](
const std::unique_ptr<TreeEntry> &TE) {
21866 return TE->UserTreeIndex.UserTE == UserTE &&
21867 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
21869 if (It == VectorizableTree.end())
21872 if (!(*It)->ReorderIndices.empty()) {
21876 if (!
all_of(
zip(GatheredScalars, GS), [&](
const auto &
P) {
21877 Value *V0 = std::get<0>(
P);
21878 Value *V1 = std::get<1>(
P);
21886 if ((Mask.size() < InputVF &&
21889 (Mask.size() == InputVF &&
21892 std::next(Mask.begin(),
I * SliceSize),
21893 std::next(Mask.begin(),
21900 std::next(Mask.begin(),
I * SliceSize),
21901 std::next(Mask.begin(),
// Create the shuffle builder (forwarding Params) and a value-initialized
// result.
21907 BVTy ShuffleBuilder(ScalarTy, Params...);
21908 ResTy Res = ResTy();
21912 Value *ExtractVecBase =
nullptr;
21913 bool UseVecBaseAsInput =
false;
21916 Type *OrigScalarTy = GatheredScalars.
front()->getType();
21918 unsigned NumParts =
21922 bool Resized =
false;
// Extract-element analysis of the gathered scalars. The variable receiving
// the call's result is on a line not shown; presumably it is ExtractShuffles,
// which is tested right below.
21924 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
21925 if (!ExtractShuffles.
empty()) {
21927 for (
auto [Idx,
I] :
enumerate(ExtractMask)) {
21933 ExtractEntries.
append(TEs.begin(), TEs.end());
// If the builder reports that emission has to be delayed, E is recorded in
// PostponedGathers.
21935 if (std::optional<ResTy> Delayed =
21936 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
21938 PostponedGathers.insert(E);
21943 if (
Value *VecBase = ShuffleBuilder.adjustExtracts(
21944 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
21945 ExtractVecBase = VecBase;
21947 if (VF == VecBaseTy->getNumElements() &&
21948 GatheredScalars.
size() != VF) {
21950 GatheredScalars.
append(VF - GatheredScalars.
size(),
// Condition under which matching against existing tree entries
// (isGatherShuffledEntry) is attempted.
21958 if (!ExtractShuffles.
empty() || !E->hasState() ||
21959 E->getOpcode() != Instruction::Load ||
21960 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
21964 return isa<LoadInst>(V) && isVectorized(V);
21966 (E->hasState() && E->isAltShuffle()) ||
21967 all_of(E->Scalars, [
this](
Value *V) { return isVectorized(V); }) ||
21969 (E->Scalars != GatheredScalars && GatheredScalars.
size() <= 2)) {
21971 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
21973 if (!GatherShuffles.
empty()) {
21974 if (std::optional<ResTy> Delayed =
21975 ShuffleBuilder.needToDelay(E, Entries)) {
21977 PostponedGathers.insert(E);
// Single shuffle whose first entry has the same scalars as E: the existing
// entry (FrontTE) is reused through the builder.
21982 if (GatherShuffles.
size() == 1 &&
21984 (Entries.
front().front()->isSame(E->Scalars) ||
21985 E->isSame(Entries.
front().front()->Scalars))) {
21988 LLVM_DEBUG(
dbgs() <<
"SLP: perfect diamond match for gather bundle "
21991 Mask.resize(E->Scalars.size());
21992 const TreeEntry *FrontTE = Entries.
front().front();
// Identity mask when neither entry is reordered and the sizes line up;
// otherwise lanes are looked up in FrontTE with findLaneForValue.
21993 if (FrontTE->ReorderIndices.empty() && E->ReorderIndices.empty() &&
21994 ((FrontTE->ReuseShuffleIndices.empty() &&
21995 E->Scalars.size() == FrontTE->Scalars.size()) ||
21996 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
21997 std::iota(Mask.begin(), Mask.end(), 0);
22004 Mask[
I] = FrontTE->findLaneForValue(V);
22009 ShuffleBuilder.resetForSameNode();
// When E and FrontTE describe the same vector (same scalars, reorder and reuse
// indices), FrontTE is added with an identity mask and finalized without
// extra masks; otherwise E->getCommonMask() is applied on finalize.
22011 if ((E->isSame(FrontTE->Scalars) &&
22012 FrontTE->ReuseShuffleIndices.empty() &&
22013 FrontTE->ReorderIndices.empty() &&
22014 E->getVectorFactor() == FrontTE->getVectorFactor()) ||
22015 (
equal(E->Scalars, FrontTE->Scalars) &&
22016 equal(E->ReorderIndices, FrontTE->ReorderIndices) &&
22017 equal(E->ReuseShuffleIndices, FrontTE->ReuseShuffleIndices))) {
22018 Mask.resize(FrontTE->getVectorFactor());
22019 std::iota(Mask.begin(), Mask.end(), 0);
22020 ShuffleBuilder.add(*FrontTE, Mask);
22021 Res = ShuffleBuilder.finalize({}, {}, {});
22023 ShuffleBuilder.add(*FrontTE, Mask);
22024 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
22029 if (GatheredScalars.
size() != VF &&
22031 return any_of(TEs, [&](
const TreeEntry *TE) {
22032 return TE->getVectorFactor() == VF;
22035 GatheredScalars.
append(VF - GatheredScalars.
size(),
22039 for (
int I = 0, Sz = Mask.size();
I < Sz; ++
I) {
// Parameter tail and body of a local lambda whose header line is not shown.
// NOTE(review): presumably this is TryPackScalars, which is called further
// down with (scalars, mask, bool) - confirm.
22047 bool IsRootPoison) {
22050 bool IsSplat = IsRootPoison &&
isSplat(Scalars) &&
22057 int NumNonConsts = 0;
22076 Scalars.
front() = OrigV;
// UniquePositions remembers the first index of each value; ReuseMask[I] is
// set to that index.
22079 const auto Res = UniquePositions.
try_emplace(OrigV,
I);
22080 Scalars[Res.first->second] = OrigV;
22081 ReuseMask[
I] = Res.first->second;
22084 if (NumNonConsts == 1) {
22089 if (!UndefPos.
empty() && UndefPos.
front() == 0)
22092 ReuseMask[SinglePos] = SinglePos;
22093 }
else if (!UndefPos.
empty() && IsSplat) {
22100 (E->UserTreeIndex &&
any_of(V->uses(), [E](
const Use &U) {
22103 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
22104 is_contained(E->UserTreeIndex.UserTE->Scalars,
22108 if (It != Scalars.
end()) {
22110 int Pos = std::distance(Scalars.
begin(), It);
22111 for (
int I : UndefPos) {
22113 ReuseMask[
I] = Pos;
22122 for (
int I : UndefPos) {
// At least one of the two analyses found a shuffle: feed the matched vectors
// to the builder.
22131 if (!ExtractShuffles.
empty() || !GatherShuffles.
empty()) {
22132 bool IsNonPoisoned =
true;
22133 bool IsUsedInExpr =
true;
22134 Value *Vec1 =
nullptr;
22135 if (!ExtractShuffles.
empty()) {
22139 Value *Vec2 =
nullptr;
22140 for (
unsigned I = 0, Sz = ExtractMask.
size();
I < Sz; ++
I) {
22144 if (UseVecBaseAsInput) {
22145 Vec1 = ExtractVecBase;
22147 for (
unsigned I = 0, Sz = ExtractMask.
size();
I < Sz; ++
I) {
// The vector operand of the extract is replaced by the vectorized value of
// its tree entry when one is available.
22153 Value *VecOp = EI->getVectorOperand();
22155 !TEs.
empty() && TEs.front()->VectorizedValue)
22156 VecOp = TEs.front()->VectorizedValue;
22159 }
else if (Vec1 != VecOp) {
22160 assert((!Vec2 || Vec2 == VecOp) &&
22161 "Expected only 1 or 2 vectors shuffle.");
22167 IsUsedInExpr =
false;
22170 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
22173 IsUsedInExpr &= FindReusedSplat(
22176 ExtractMask.
size(), IsNotPoisonedVec);
22177 ShuffleBuilder.add(Vec1, ExtractMask,
true);
22178 IsNonPoisoned &= IsNotPoisonedVec;
22180 IsUsedInExpr =
false;
22185 if (!GatherShuffles.
empty()) {
22187 if (Mask.size() == E->Scalars.size())
// Each part is a shuffle of one or two tree entries; its sub-mask is copied
// into VecMask at offset I * SliceSize before being added to the builder.
22192 for (
const auto [
I, TEs] :
enumerate(Entries)) {
22195 "No shuffles with empty entries list expected.");
22198 assert((TEs.size() == 1 || TEs.size() == 2) &&
22199 "Expected shuffle of 1 or 2 entries.");
22200 unsigned Limit =
getNumElems(Mask.size(), SliceSize,
I);
22203 copy(SubMask, std::next(VecMask.
begin(),
I * SliceSize));
22204 if (TEs.size() == 1) {
22205 bool IsNotPoisonedVec =
22206 TEs.front()->VectorizedValue
22210 FindReusedSplat(VecMask, TEs.
front()->getVectorFactor(),
I,
22211 SliceSize, IsNotPoisonedVec);
22212 ShuffleBuilder.add(*TEs.front(), VecMask);
22213 IsNonPoisoned &= IsNotPoisonedVec;
22215 IsUsedInExpr =
false;
22216 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
22217 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
22228 int EMSz = ExtractMask.
size();
22229 int MSz = Mask.size();
22232 bool IsSingleShuffle = ExtractShuffles.
empty() || GatherShuffles.
empty();
22233 bool IsIdentityShuffle =
22234 ((UseVecBaseAsInput ||
22236 [](
const std::optional<TTI::ShuffleKind> &SK) {
22240 none_of(ExtractMask, [&](
int I) {
return I >= EMSz; }) &&
22242 (!GatherShuffles.
empty() &&
22244 [](
const std::optional<TTI::ShuffleKind> &SK) {
22248 none_of(Mask, [&](
int I) {
return I >= MSz; }) &&
22250 bool EnoughConstsForShuffle =
22260 (!IsIdentityShuffle ||
22261 (GatheredScalars.
size() == 2 &&
22269 for (
int I = 0, Sz = GatheredScalars.
size();
I < Sz; ++
I) {
22270 if (EnoughConstsForShuffle &&
isa<Constant>(GatheredScalars[
I]))
// The packed scalars are gathered into a build vector and added to the
// builder with BVMask.
22278 TryPackScalars(GatheredScalars, BVMask,
true);
22279 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.
size());
22280 ShuffleBuilder.add(BV, BVMask);
22284 (IsSingleShuffle && ((IsIdentityShuffle &&
22287 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
22290 Res = ShuffleBuilder.finalize(
22291 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.
size(),
22293 bool IsSplat = isSplat(NonConstants);
22294 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
22295 TryPackScalars(NonConstants, BVMask, false);
// Cost check: SplatCost is one insertelement plus a two-source permute;
// BVCost is one insertelement into Vec plus, when BVMask has more than one
// non-poison element, a single-source permute. The splat form is chosen when
// SplatCost <= BVCost.
22296 auto CheckIfSplatIsProfitable = [&]() {
22299 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
22300 Value *V = *find_if_not(NonConstants, IsaPred<UndefValue>);
22301 if (isa<ExtractElementInst>(V) || isVectorized(V))
22303 InstructionCost SplatCost = TTI->getVectorInstrCost(
22304 Instruction::InsertElement, VecTy, CostKind, 0,
22305 PoisonValue::get(VecTy), V);
22306 SmallVector<int> NewMask(Mask.begin(), Mask.end());
22307 for (auto [Idx, I] : enumerate(BVMask))
22308 if (I != PoisonMaskElem)
22309 NewMask[Idx] = Mask.size();
22310 SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
22311 NewMask, CostKind);
22312 InstructionCost BVCost = TTI->getVectorInstrCost(
22313 Instruction::InsertElement, VecTy, CostKind,
22314 *find_if(Mask, not_equal_to(PoisonMaskElem)), Vec, V);
22316 if (count(BVMask, PoisonMaskElem) <
22317 static_cast<int>(BVMask.size() - 1)) {
22318 SmallVector<int> NewMask(Mask.begin(), Mask.end());
22319 for (auto [Idx, I] : enumerate(BVMask))
22320 if (I != PoisonMaskElem)
22322 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
22323 VecTy, NewMask, CostKind);
22325 return SplatCost <= BVCost;
// Not a splat, a mask of at most two elements, or an unprofitable splat: the
// non-constants are gathered on top of Vec.
22327 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
22331 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
// Splat form: gather the values, broadcast lane 0 through SplatMask, then
// blend with Vec using Mask.
22337 Value *BV = ShuffleBuilder.gather(Values, BVMask.
size());
22340 return I == PoisonMaskElem ? PoisonMaskElem : 0;
22343 BV = CreateShuffle(BV,
nullptr, SplatMask);
22346 Mask[Idx] = BVMask.size() + Idx;
22347 Vec = CreateShuffle(Vec, BV, Mask);
22356 TryPackScalars(GatheredScalars, ReuseMask,
true);
22357 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
22358 ShuffleBuilder.add(BV, ReuseMask);
22359 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
22364 for (
auto [
I, V] :
enumerate(GatheredScalars)) {
22368 Value *BV = ShuffleBuilder.gather(GatheredScalars);
22369 ShuffleBuilder.add(BV, Mask);
22370 Res = ShuffleBuilder.finalize(
E->ReuseShuffleIndices, SubVectors,
// The result is passed through the builder's createFreeze.
// NOTE(review): presumably guarded by NeedFreeze - the condition line is not
// shown.
22375 Res = ShuffleBuilder.createFreeze(Res);
// Creates the build vector for tree entry E by delegating to
// processBuildVector instantiated with ShuffleInstructionBuilder and a
// Value * result.
// NOTE(review): the loop body and the remaining call arguments are not shown
// in this listing.
22379Value *BoUpSLP::createBuildVector(
const TreeEntry *
E,
Type *ScalarTy) {
// Unless E is a split-vectorize entry that was transformed to a gather node,
// its combined sub-entries are visited first. What is done with each one is
// not shown; presumably they are vectorized.
22381 if (
E->State != TreeEntry::SplitVectorize ||
22382 !TransformedToGatherNodes.contains(
E)) {
22383 for (
auto [EIdx,
_] :
E->CombinedEntriesWithIndices)
22386 return processBuildVector<ShuffleInstructionBuilder, Value *>(
E, ScalarTy,
22394 for (
Value *V : VL)
22407 IRBuilderBase::InsertPointGuard Guard(Builder);
22409 Value *
V =
E->Scalars.front();
22411 auto It = MinBWs.find(
E);
22412 if (It != MinBWs.end()) {
22418 if (
E->VectorizedValue)
22419 return E->VectorizedValue;
22421 if (
E->isGather() || TransformedToGatherNodes.contains(
E)) {
22423 if (
E->hasState() &&
E->Idx == 0 && !UserIgnoreList)
22424 setInsertPointAfterBundle(
E);
22425 Value *Vec = createBuildVector(
E, ScalarTy);
22426 E->VectorizedValue = Vec;
22429 if (
E->State == TreeEntry::SplitVectorize) {
22430 assert(
E->CombinedEntriesWithIndices.size() == 2 &&
22431 "Expected exactly 2 combined entries.");
22432 setInsertPointAfterBundle(
E);
22434 *VectorizableTree[
E->CombinedEntriesWithIndices.front().first];
22436 ArrayRef(
E->Scalars).take_front(OpTE1.getVectorFactor())) &&
22437 "Expected same first part of scalars.");
22440 *VectorizableTree[
E->CombinedEntriesWithIndices.back().first];
22442 OpTE2.isSame(
ArrayRef(
E->Scalars).take_back(OpTE2.getVectorFactor())) &&
22443 "Expected same second part of scalars.");
22445 auto GetOperandSignedness = [&](
const TreeEntry *OpE) {
22446 bool IsSigned =
false;
22447 auto It = MinBWs.find(OpE);
22448 if (It != MinBWs.end())
22449 IsSigned = It->second.second;
22452 if (isa<PoisonValue>(V))
22454 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22461 Op1 = Builder.CreateIntCast(
22466 GetOperandSignedness(&OpTE1));
22471 Op2 = Builder.CreateIntCast(
22476 GetOperandSignedness(&OpTE2));
22478 if (
E->ReorderIndices.empty()) {
22482 std::next(
Mask.begin(),
E->CombinedEntriesWithIndices.back().second),
22485 if (ScalarTyNumElements != 1) {
22489 Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
22491 E->CombinedEntriesWithIndices.back().second *
22492 ScalarTyNumElements);
22493 E->VectorizedValue = Vec;
22496 unsigned CommonVF =
22497 std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
22504 Op1 = Builder.CreateShuffleVector(Op1, Mask);
22510 Op2 = Builder.CreateShuffleVector(Op2, Mask);
22512 Value *Vec = Builder.CreateShuffleVector(Op1, Op2,
E->getSplitMask());
22513 E->VectorizedValue = Vec;
22517 bool IsReverseOrder =
22519 auto FinalShuffle = [&](
Value *
V,
const TreeEntry *
E) {
22521 if (
E->getOpcode() == Instruction::Store &&
22522 E->State == TreeEntry::Vectorize) {
22523 ArrayRef<int>
Mask =
22524 ArrayRef(
reinterpret_cast<const int *
>(
E->ReorderIndices.begin()),
22525 E->ReorderIndices.size());
22526 ShuffleBuilder.add(V, Mask);
22527 }
else if ((
E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
22528 E->State == TreeEntry::CompressVectorize) {
22529 ShuffleBuilder.addOrdered(V, {});
22531 ShuffleBuilder.addOrdered(V,
E->ReorderIndices);
22534 E->CombinedEntriesWithIndices.size());
22536 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](
const auto &
P) {
22537 return std::make_pair(VectorizableTree[P.first].get(), P.second);
22540 (
E->CombinedEntriesWithIndices.empty() ||
E->ReorderIndices.empty()) &&
22541 "Expected either combined subnodes or reordering");
22542 return ShuffleBuilder.finalize(
E->ReuseShuffleIndices, SubVectors, {});
22545 assert(!
E->isGather() &&
"Unhandled state");
22546 unsigned ShuffleOrOp =
22547 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector :
E->getOpcode();
22548 if (!
E->isAltShuffle()) {
22549 switch (E->CombinedOp) {
22550 case TreeEntry::ReducedBitcast:
22551 case TreeEntry::ReducedBitcastBSwap:
22552 case TreeEntry::ReducedBitcastLoads:
22553 case TreeEntry::ReducedBitcastBSwapLoads:
22554 case TreeEntry::ReducedCmpBitcast:
22555 ShuffleOrOp = E->CombinedOp;
22562 auto GetOperandSignedness = [&](
unsigned Idx) {
22563 const TreeEntry *OpE = getOperandEntry(
E, Idx);
22564 bool IsSigned =
false;
22565 auto It = MinBWs.find(OpE);
22566 if (It != MinBWs.end())
22567 IsSigned = It->second.second;
22570 if (isa<PoisonValue>(V))
22572 return !isKnownNonNegative(R, SimplifyQuery(*DL));
22576 switch (ShuffleOrOp) {
22577 case Instruction::PHI: {
22578 assert((
E->ReorderIndices.empty() || !
E->ReuseShuffleIndices.empty() ||
22579 E != VectorizableTree.front().get() ||
E->UserTreeIndex) &&
22580 "PHI reordering is free.");
22582 Builder.SetInsertPoint(PH->getParent(),
22583 PH->getParent()->getFirstNonPHIIt());
22585 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
22589 Builder.SetInsertPoint(PH->getParent(),
22590 PH->getParent()->getFirstInsertionPt());
22593 V = FinalShuffle(V,
E);
22595 E->VectorizedValue =
V;
22602 SmallDenseMap<BasicBlock *, unsigned, 4> VisitedBBs;
22608 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
22614 TreeEntry *OpTE = getOperandEntry(
E,
I);
22615 if (OpTE->isGather() || DeletedNodes.contains(OpTE) ||
22616 TransformedToGatherNodes.contains(OpTE)) {
22619 assert(!OpTE->VectorizedValue &&
"Expected no vectorized value.");
22620 OpTE->VectorizedValue = VecOp;
22627 Value *Vec = vectorizeOperand(
E,
I);
22628 if (VecTy != Vec->
getType()) {
22630 MinBWs.contains(getOperandEntry(
E,
I))) &&
22631 "Expected item in MinBWs.");
22632 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(
I));
22638 "Invalid number of incoming values");
22639 assert(
E->VectorizedValue &&
"Expected vectorized value.");
22640 return E->VectorizedValue;
22643 case Instruction::ExtractElement: {
22644 Value *
V =
E->getSingleOperand(0);
22645 setInsertPointAfterBundle(
E);
22646 V = FinalShuffle(V,
E);
22647 E->VectorizedValue =
V;
22650 case Instruction::ExtractValue: {
22652 Builder.SetInsertPoint(LI);
22653 Value *Ptr = LI->getPointerOperand();
22654 LoadInst *
V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
22656 NewV = FinalShuffle(NewV,
E);
22657 E->VectorizedValue = NewV;
22660 case Instruction::InsertElement: {
22661 assert(
E->ReuseShuffleIndices.empty() &&
"All inserts should be unique");
22662 if (
const TreeEntry *OpE = getOperandEntry(
E, 1);
22663 OpE && !OpE->isGather() && OpE->hasState() &&
22664 !OpE->hasCopyableElements())
22667 setInsertPointAfterBundle(
E);
22668 Value *
V = vectorizeOperand(
E, 1);
22670 Type *ScalarTy =
Op.front()->getType();
22673 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(
E, 1));
22674 assert(Res.first > 0 &&
"Expected item in MinBWs.");
22675 V = Builder.CreateIntCast(
22685 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
22687 const unsigned NumElts =
22689 const unsigned NumScalars =
E->Scalars.size();
22692 assert(
Offset < NumElts &&
"Failed to find vector index offset");
22695 SmallVector<int>
Mask;
22696 if (!
E->ReorderIndices.empty()) {
22701 std::iota(
Mask.begin(), std::next(
Mask.begin(), NumScalars), 0);
22704 bool IsIdentity =
true;
22706 Mask.swap(PrevMask);
22707 for (
unsigned I = 0;
I < NumScalars; ++
I) {
22710 IsIdentity &= InsertIdx -
Offset ==
I;
22713 if (!IsIdentity || NumElts != NumScalars) {
22714 Value *V2 =
nullptr;
22715 bool IsVNonPoisonous =
22717 SmallVector<int> InsertMask(Mask);
22718 if (NumElts != NumScalars &&
Offset == 0) {
22727 InsertMask[*InsertIdx] = *InsertIdx;
22733 SmallBitVector UseMask =
22734 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
22735 SmallBitVector IsFirstPoison =
22737 SmallBitVector IsFirstUndef =
22739 if (!IsFirstPoison.
all()) {
22741 for (
unsigned I = 0;
I < NumElts;
I++) {
22743 IsFirstUndef.
test(
I)) {
22744 if (IsVNonPoisonous) {
22745 InsertMask[
I] =
I < NumScalars ?
I : 0;
22750 if (Idx >= NumScalars)
22751 Idx = NumScalars - 1;
22752 InsertMask[
I] = NumScalars + Idx;
22765 V = Builder.CreateShuffleVector(V, V2, InsertMask);
22767 GatherShuffleExtractSeq.insert(
I);
22768 CSEBlocks.insert(
I->getParent());
22773 for (
unsigned I = 0;
I < NumElts;
I++) {
22777 SmallBitVector UseMask =
22778 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
22779 SmallBitVector IsFirstUndef =
22781 if ((!IsIdentity ||
Offset != 0 || !IsFirstUndef.
all()) &&
22782 NumElts != NumScalars) {
22783 if (IsFirstUndef.
all()) {
22785 SmallBitVector IsFirstPoison =
22787 if (!IsFirstPoison.
all()) {
22788 for (
unsigned I = 0;
I < NumElts;
I++) {
22790 InsertMask[
I] =
I + NumElts;
22793 V = Builder.CreateShuffleVector(
22799 GatherShuffleExtractSeq.insert(
I);
22800 CSEBlocks.insert(
I->getParent());
22804 SmallBitVector IsFirstPoison =
22806 for (
unsigned I = 0;
I < NumElts;
I++) {
22810 InsertMask[
I] += NumElts;
22812 V = Builder.CreateShuffleVector(
22813 FirstInsert->getOperand(0), V, InsertMask,
22816 GatherShuffleExtractSeq.insert(
I);
22817 CSEBlocks.insert(
I->getParent());
22822 ++NumVectorInstructions;
22823 E->VectorizedValue =
V;
22826 case Instruction::ZExt:
22827 case Instruction::SExt:
22828 case Instruction::FPToUI:
22829 case Instruction::FPToSI:
22830 case Instruction::FPExt:
22831 case Instruction::PtrToInt:
22832 case Instruction::IntToPtr:
22833 case Instruction::SIToFP:
22834 case Instruction::UIToFP:
22835 case Instruction::Trunc:
22836 case Instruction::FPTrunc:
22837 case Instruction::BitCast: {
22838 setInsertPointAfterBundle(
E);
22840 Value *InVec = vectorizeOperand(
E, 0);
22845 auto SrcIt = MinBWs.find(getOperandEntry(
E, 0));
22847 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
22850 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
22851 if (SrcIt != MinBWs.end())
22852 SrcBWSz = SrcIt->second.first;
22853 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->
getScalarType());
22854 if (BWSz == SrcBWSz) {
22855 VecOpcode = Instruction::BitCast;
22856 }
else if (BWSz < SrcBWSz) {
22857 VecOpcode = Instruction::Trunc;
22858 }
else if (It != MinBWs.end()) {
22859 assert(BWSz > SrcBWSz &&
"Invalid cast!");
22860 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
22861 }
else if (SrcIt != MinBWs.end()) {
22862 assert(BWSz > SrcBWSz &&
"Invalid cast!");
22864 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
22866 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
22867 !SrcIt->second.second) {
22868 VecOpcode = Instruction::UIToFP;
22869 }
else if (VecOpcode == Instruction::BitCast && SrcIt != MinBWs.end() &&
22871 Type *OrigSrcScalarTy = CI->getSrcTy();
22872 auto *OrigSrcVectorTy =
22875 Builder.CreateIntCast(InVec, OrigSrcVectorTy, SrcIt->second.second);
22877 Value *
V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
22879 : Builder.CreateCast(VecOpcode, InVec, VecTy);
22880 V = FinalShuffle(V,
E);
22882 E->VectorizedValue =
V;
22883 ++NumVectorInstructions;
22886 case Instruction::FCmp:
22887 case Instruction::ICmp: {
22888 setInsertPointAfterBundle(
E);
22890 Value *
L = vectorizeOperand(
E, 0);
22891 Value *
R = vectorizeOperand(
E, 1);
22892 if (
L->getType() !=
R->getType()) {
22895 MinBWs.contains(getOperandEntry(
E, 0)) ||
22896 MinBWs.contains(getOperandEntry(
E, 1))) &&
22897 "Expected item in MinBWs.");
22900 ->getIntegerBitWidth();
22903 ->getIntegerBitWidth();
22908 auto *CI = dyn_cast<ConstantInt>(V);
22910 CI->getValue().getActiveBits() > LBW;
22914 auto *CI = dyn_cast<ConstantInt>(V);
22915 return CI && CI->getValue().getActiveBits() <= RBW;
22917 Type *CastTy =
R->getType();
22918 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
22920 Type *CastTy =
L->getType();
22921 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
22926 Value *
V = Builder.CreateCmp(P0, L, R);
22929 ICmp->setSameSign(
false);
22932 V = FinalShuffle(V,
E);
22934 E->VectorizedValue =
V;
22935 ++NumVectorInstructions;
22938 case Instruction::Select: {
22939 setInsertPointAfterBundle(
E);
22942 Value *True = vectorizeOperand(
E, 1);
22943 Value *False = vectorizeOperand(
E, 2);
22947 MinBWs.contains(getOperandEntry(
E, 1)) ||
22948 MinBWs.contains(getOperandEntry(
E, 2))) &&
22949 "Expected item in MinBWs.");
22950 if (True->
getType() != VecTy)
22951 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
22952 if (False->
getType() != VecTy)
22953 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
22958 assert(TrueNumElements >= CondNumElements &&
22959 TrueNumElements % CondNumElements == 0 &&
22960 "Cannot vectorize Instruction::Select");
22962 "Cannot vectorize Instruction::Select");
22963 if (CondNumElements != TrueNumElements) {
22966 Cond = Builder.CreateShuffleVector(
22971 "Cannot vectorize Instruction::Select");
22973 Builder.CreateSelectWithUnknownProfile(
Cond, True, False,
DEBUG_TYPE);
22974 V = FinalShuffle(V,
E);
22976 E->VectorizedValue =
V;
22977 ++NumVectorInstructions;
22980 case Instruction::FNeg: {
22981 setInsertPointAfterBundle(
E);
22983 Value *
Op = vectorizeOperand(
E, 0);
22985 Value *
V = Builder.CreateUnOp(
22991 V = FinalShuffle(V,
E);
22993 E->VectorizedValue =
V;
22994 ++NumVectorInstructions;
22998 case Instruction::Freeze: {
22999 setInsertPointAfterBundle(
E);
23001 Value *
Op = vectorizeOperand(
E, 0);
23003 if (
Op->getType() != VecTy) {
23005 MinBWs.contains(getOperandEntry(
E, 0))) &&
23006 "Expected item in MinBWs.");
23007 Op = Builder.CreateIntCast(
Op, VecTy, GetOperandSignedness(0));
23009 Value *
V = Builder.CreateFreeze(
Op);
23010 V = FinalShuffle(V,
E);
23012 E->VectorizedValue =
V;
23013 ++NumVectorInstructions;
23017 case Instruction::Add:
23018 case Instruction::FAdd:
23019 case Instruction::Sub:
23020 case Instruction::FSub:
23021 case Instruction::Mul:
23022 case Instruction::FMul:
23023 case Instruction::UDiv:
23024 case Instruction::SDiv:
23025 case Instruction::FDiv:
23026 case Instruction::URem:
23027 case Instruction::SRem:
23028 case Instruction::FRem:
23029 case Instruction::Shl:
23030 case Instruction::LShr:
23031 case Instruction::AShr:
23032 case Instruction::And:
23033 case Instruction::Or:
23034 case Instruction::Xor: {
23035 setInsertPointAfterBundle(
E);
23039 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
23044 return CI && CI->getValue().countr_one() >= It->second.first;
23046 V = FinalShuffle(
I == 0 ?
RHS :
LHS,
E);
23047 E->VectorizedValue =
V;
23048 ++NumVectorInstructions;
23056 MinBWs.contains(getOperandEntry(
E, 0)) ||
23057 MinBWs.contains(getOperandEntry(
E, 1))) &&
23058 "Expected item in MinBWs.");
23060 LHS = Builder.CreateIntCast(
LHS, VecTy, GetOperandSignedness(0));
23062 RHS = Builder.CreateIntCast(
RHS, VecTy, GetOperandSignedness(1));
23065 Value *
V = Builder.CreateBinOp(
23072 if (!MinBWs.contains(
E) && ShuffleOrOp == Instruction::Sub &&
23074 return isa<PoisonValue>(V) ||
23075 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
23076 isCommutative(cast<Instruction>(V));
23078 I->setHasNoUnsignedWrap(
false);
23081 V = FinalShuffle(V,
E);
23083 E->VectorizedValue =
V;
23084 ++NumVectorInstructions;
23088 case Instruction::Load: {
23091 setInsertPointAfterBundle(
E);
23095 FixedVectorType *StridedLoadTy =
nullptr;
23096 Value *PO = LI->getPointerOperand();
23097 if (
E->State == TreeEntry::Vectorize) {
23098 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
23099 }
else if (
E->State == TreeEntry::CompressVectorize) {
23100 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
23101 CompressEntryToData.at(
E);
23102 Align CommonAlignment = LI->getAlign();
23108 for (
int I : CompressMask)
23112 MaskValues =
replicateMask(MaskValues, VecTy->getNumElements());
23115 NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
23118 NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
23129 }
else if (
E->State == TreeEntry::StridedVectorize) {
23132 PO = IsReverseOrder ? PtrN : Ptr0;
23133 Type *StrideTy = DL->getIndexType(PO->
getType());
23136 StridedLoadTy = SPtrInfo.Ty;
23137 assert(StridedLoadTy &&
"Missing StridedPointerInfo for tree entry.");
23138 unsigned StridedLoadEC =
23141 Value *Stride = SPtrInfo.StrideVal;
23143 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
23144 assert(StrideSCEV &&
"Neither StrideVal nor StrideSCEV were set.");
23145 SCEVExpander Expander(*SE,
"strided-load-vec");
23146 Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->
getType(),
23147 &*Builder.GetInsertPoint());
23150 Builder.CreateIntCast(Stride, StrideTy,
true);
23151 StrideVal = Builder.CreateMul(
23153 StrideTy, (IsReverseOrder ? -1 : 1) *
23155 DL->getTypeAllocSize(ScalarTy))));
23157 auto *Inst = Builder.CreateIntrinsic(
23158 Intrinsic::experimental_vp_strided_load,
23159 {StridedLoadTy, PO->
getType(), StrideTy},
23162 Builder.getInt32(StridedLoadEC)});
23163 Inst->addParamAttr(
23165 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
23168 assert(
E->State == TreeEntry::ScatterVectorize &&
"Unhandled state");
23169 Value *VecPtr = vectorizeOperand(
E, 0);
23176 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
23177 "Cannot expand getelementptr.");
23178 unsigned VF = VecTyNumElements / ScalarTyNumElements;
23181 return Builder.getInt64(I % ScalarTyNumElements);
23183 VecPtr = Builder.CreateGEP(
23184 VecTy->getElementType(),
23185 Builder.CreateShuffleVector(
23191 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
23193 Value *
V =
E->State == TreeEntry::CompressVectorize
23197 if (StridedLoadTy != VecTy)
23198 V = Builder.CreateBitOrPointerCast(V, VecTy);
23199 V = FinalShuffle(V,
E);
23200 E->VectorizedValue =
V;
23201 ++NumVectorInstructions;
23204 case Instruction::Store: {
23207 setInsertPointAfterBundle(
E);
23209 Value *VecValue = vectorizeOperand(
E, 0);
23210 if (VecValue->
getType() != VecTy)
23212 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
23213 VecValue = FinalShuffle(VecValue,
E);
23215 Value *Ptr =
SI->getPointerOperand();
23217 if (
E->State == TreeEntry::Vectorize) {
23218 ST = Builder.CreateAlignedStore(VecValue, Ptr,
SI->getAlign());
23220 assert(
E->State == TreeEntry::StridedVectorize &&
23221 "Expected either strided or consecutive stores.");
23222 if (!
E->ReorderIndices.empty()) {
23224 Ptr =
SI->getPointerOperand();
23227 Type *StrideTy = DL->getIndexType(
SI->getPointerOperandType());
23230 Value *Stride = SPtrInfo.StrideVal;
23231 assert(Stride &&
"Missing StridedPointerInfo for tree entry.");
23233 Builder.CreateIntCast(Stride, StrideTy,
true);
23235 StrideVal = Builder.CreateMul(
23238 StrideTy,
static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
23239 auto *Inst = Builder.CreateIntrinsic(
23240 Intrinsic::experimental_vp_strided_store,
23241 {VecTy, Ptr->
getType(), StrideTy},
23242 {VecValue, Ptr, StrideVal,
23243 Builder.getAllOnesMask(VecTy->getElementCount()),
23244 Builder.getInt32(
E->Scalars.size())});
23245 Inst->addParamAttr(
23247 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
23253 E->VectorizedValue =
V;
23254 ++NumVectorInstructions;
23257 case Instruction::GetElementPtr: {
23259 setInsertPointAfterBundle(
E);
23261 Value *Op0 = vectorizeOperand(
E, 0);
23264 for (
int J = 1,
N = GEP0->getNumOperands(); J <
N; ++J) {
23265 Value *OpVec = vectorizeOperand(
E, J);
23269 Value *
V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
23272 for (
Value *V :
E->Scalars) {
23279 V = FinalShuffle(V,
E);
23281 E->VectorizedValue =
V;
23282 ++NumVectorInstructions;
23286 case Instruction::Call: {
23288 setInsertPointAfterBundle(
E);
23293 CI,
ID, VecTy->getNumElements(),
23294 It != MinBWs.end() ? It->second.first : 0, TTI);
23297 VecCallCosts.first <= VecCallCosts.second;
23299 Value *ScalarArg =
nullptr;
23310 ScalarArg = CEI->getArgOperand(
I);
23313 if (
ID == Intrinsic::abs && It != MinBWs.end() &&
23314 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
23315 ScalarArg = Builder.getFalse();
23322 Value *OpVec = vectorizeOperand(
E,
I);
23323 ScalarArg = CEI->getArgOperand(
I);
23326 It == MinBWs.end()) {
23329 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(
I));
23330 }
else if (It != MinBWs.end()) {
23331 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(
I));
23340 if (!UseIntrinsic) {
23345 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
23352 Value *
V = Builder.CreateCall(CF, OpVecs, OpBundles);
23356 V = FinalShuffle(V,
E);
23358 E->VectorizedValue =
V;
23359 ++NumVectorInstructions;
23362 case Instruction::ShuffleVector: {
23365 setInsertPointAfterBundle(
E);
23366 Value *Src = vectorizeOperand(
E, 0);
23369 SmallVector<int> NewMask(ThisMask.size());
23371 return SVSrc->getShuffleMask()[Mask];
23373 V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
23374 SVSrc->getOperand(1), NewMask);
23376 V = Builder.CreateShuffleVector(Src, ThisMask);
23381 V = FinalShuffle(V,
E);
23389 "Invalid Shuffle Vector Operand");
23393 setInsertPointAfterBundle(
E);
23394 LHS = vectorizeOperand(
E, 0);
23395 RHS = vectorizeOperand(
E, 1);
23397 setInsertPointAfterBundle(
E);
23398 LHS = vectorizeOperand(
E, 0);
23404 assert((It != MinBWs.end() ||
23405 getOperandEntry(
E, 0)->State == TreeEntry::NeedToGather ||
23406 getOperandEntry(
E, 1)->State == TreeEntry::NeedToGather ||
23407 MinBWs.contains(getOperandEntry(
E, 0)) ||
23408 MinBWs.contains(getOperandEntry(
E, 1))) &&
23409 "Expected item in MinBWs.");
23410 Type *CastTy = VecTy;
23416 ->getIntegerBitWidth())
23422 LHS = Builder.CreateIntCast(
LHS, CastTy, GetOperandSignedness(0));
23424 RHS = Builder.CreateIntCast(
RHS, CastTy, GetOperandSignedness(1));
23429 V0 = Builder.CreateBinOp(
23431 V1 = Builder.CreateBinOp(
23434 V0 = Builder.CreateCmp(CI0->getPredicate(),
LHS,
RHS);
23437 V1 = Builder.CreateCmp(AltPred,
LHS,
RHS);
23440 unsigned SrcBWSz = DL->getTypeSizeInBits(
23442 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
23443 if (BWSz <= SrcBWSz) {
23444 if (BWSz < SrcBWSz)
23445 LHS = Builder.CreateIntCast(
LHS, VecTy, It->second.first);
23447 "Expected same type as operand.");
23451 E->VectorizedValue =
LHS;
23452 ++NumVectorInstructions;
23456 V0 = Builder.CreateCast(
23458 V1 = Builder.CreateCast(
23463 for (
Value *V : {V0, V1}) {
23465 GatherShuffleExtractSeq.insert(
I);
23466 CSEBlocks.insert(
I->getParent());
23474 SmallVector<int>
Mask;
23475 E->buildAltOpShuffleMask(
23476 [
E,
this](Instruction *
I) {
23477 assert(
E->getMatchingMainOpOrAltOp(
I) &&
23478 "Unexpected main/alternate opcode");
23482 Mask, &OpScalars, &AltScalars);
23486 auto DropNuwFlag = [&](
Value *Vec,
unsigned Opcode) {
23489 I && Opcode == Instruction::Sub && !MinBWs.contains(
E) &&
23491 if (isa<PoisonValue>(V))
23493 if (E->hasCopyableElements() && E->isCopyableElement(V))
23495 auto *IV = cast<Instruction>(V);
23496 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
23498 I->setHasNoUnsignedWrap(
false);
23500 DropNuwFlag(V0,
E->getOpcode());
23501 DropNuwFlag(V1,
E->getAltOpcode());
23507 V = Builder.CreateShuffleVector(V0, V1, Mask);
23510 GatherShuffleExtractSeq.insert(
I);
23511 CSEBlocks.insert(
I->getParent());
23515 E->VectorizedValue =
V;
23516 ++NumVectorInstructions;
23520 case TreeEntry::ReducedBitcast:
23521 case TreeEntry::ReducedBitcastBSwap: {
23522 assert(UserIgnoreList &&
"Expected reduction operations only.");
23523 setInsertPointAfterBundle(
E);
23524 TreeEntry *ZExt = getOperandEntry(
E, 0);
23526 ZExt->getMainOp()->getType(), ZExt->getVectorFactor()));
23527 TreeEntry *
Const = getOperandEntry(
E, 1);
23529 Const->Scalars.front()->getType(),
Const->getVectorFactor()));
23530 Value *
Op = vectorizeOperand(ZExt, 0);
23533 DL->getTypeSizeInBits(
cast<CastInst>(ZExt->getMainOp())->getSrcTy()) *
23534 E->getVectorFactor());
23535 auto *OrigScalarTy = ScalarTy;
23538 Op = FinalShuffle(
Op,
E);
23539 auto *
V = Builder.CreateBitCast(
Op, SrcType);
23540 ++NumVectorInstructions;
23541 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
23542 V = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, V);
23543 ++NumVectorInstructions;
23545 if (SrcType != OrigScalarTy) {
23546 V = Builder.CreateIntCast(V, OrigScalarTy,
false);
23547 ++NumVectorInstructions;
23549 E->VectorizedValue =
V;
23552 case TreeEntry::ReducedBitcastLoads:
23553 case TreeEntry::ReducedBitcastBSwapLoads: {
23554 assert(UserIgnoreList &&
"Expected reduction operations only.");
23555 TreeEntry *ZExt = getOperandEntry(
E, 0);
23556 TreeEntry *
Load = getOperandEntry(ZExt, 0);
23557 setInsertPointAfterBundle(Load);
23559 ZExt->getMainOp()->getType(), ZExt->getVectorFactor()));
23560 TreeEntry *
Const = getOperandEntry(
E, 1);
23562 Const->Scalars.front()->getType(),
Const->getVectorFactor()));
23564 Load->getMainOp()->getType(),
Load->getVectorFactor()));
23566 Value *PO = LI->getPointerOperand();
23569 DL->getTypeSizeInBits(
cast<CastInst>(ZExt->getMainOp())->getSrcTy()) *
23570 E->getVectorFactor());
23571 auto *OrigScalarTy = ScalarTy;
23572 ScalarTy = ZExt->getMainOp()->getType();
23573 Value *
V = Builder.CreateAlignedLoad(SrcTy, PO, LI->getAlign());
23574 ++NumVectorInstructions;
23575 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
23576 V = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, V);
23577 ++NumVectorInstructions;
23579 if (SrcTy != OrigScalarTy) {
23580 V = Builder.CreateIntCast(V, OrigScalarTy,
false);
23581 ++NumVectorInstructions;
23583 E->VectorizedValue =
V;
23586 case TreeEntry::ReducedCmpBitcast: {
23587 assert(UserIgnoreList &&
"Expected reduction operations only.");
23588 setInsertPointAfterBundle(
E);
23589 TreeEntry *Op1TE = getOperandEntry(
E, 1);
23590 TreeEntry *Op2TE = getOperandEntry(
E, 2);
23591 Op1TE->VectorizedValue =
23593 Op2TE->VectorizedValue =
23598 IntegerType::getIntNTy(ScalarTy->
getContext(),
E->getVectorFactor());
23599 auto *
V = Builder.CreateBitCast(Cmp, DstTy);
23600 ++NumVectorInstructions;
23601 if (DstTy != ScalarTy) {
23602 V = Builder.CreateIntCast(V, ScalarTy,
false);
23603 ++NumVectorInstructions;
23605 E->VectorizedValue =
V;
23622 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
23623 VectorValuesAndScales) {
23626 EntryToLastInstruction.clear();
23628 for (
auto &BSIter : BlocksSchedules)
23629 scheduleBlock(*
this, BSIter.second.get());
23632 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
23635 if (TE->isGather() || DeletedNodes.contains(TE.get()) ||
23636 (TE->State == TreeEntry::CombinedVectorize &&
23637 (TE->CombinedOp == TreeEntry::ReducedBitcast ||
23638 TE->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
23639 ((TE->CombinedOp == TreeEntry::ReducedBitcastLoads ||
23640 TE->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
23641 TE->CombinedOp == TreeEntry::ReducedCmpBitcast) &&
23642 (!TE->hasState() || TE->getOpcode() != Instruction::Load)))))
23644 (void)getLastInstructionInBundle(TE.get());
23648 Builder.SetInsertPoint(ReductionRoot->
getParent(),
23651 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
23659 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
23660 if (DeletedNodes.contains(TE.get()))
23662 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
23663 TE->UserTreeIndex.UserTE->hasState() &&
23664 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
23665 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
23666 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
23667 !TE->UserTreeIndex.UserTE->hasCopyableElements()) {
23668 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
23669 auto [It, Inserted] =
23670 UserTEScalarsUsedOutsideBlockCache.
try_emplace(UserTE);
23672 It->second =
all_of(UserTE->Scalars,
23673 [](
Value *V) { return isUsedOutsideBlock(V); });
23676 Instruction &LastInst = getLastInstructionInBundle(UserTE);
23680 for (
auto &Entry : GatherEntries) {
23682 Builder.SetInsertPoint(Entry.second);
23683 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
23688 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
23689 if (DeletedNodes.contains(TE.get()))
23691 if (GatheredLoadsEntriesFirst.has_value() &&
23692 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
23693 (!TE->isGather() || TE->UserTreeIndex)) {
23694 assert((TE->UserTreeIndex ||
23695 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
23696 "Expected gathered load node.");
23705 for (
const TreeEntry *E : PostponedNodes) {
23706 auto *TE =
const_cast<TreeEntry *
>(E);
23708 TE->VectorizedValue =
nullptr;
23719 (TE->UserTreeIndex.UserTE->hasState() &&
23720 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
23721 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
23730 if (UI->comesBefore(InsertPt))
23733 Builder.SetInsertPoint(InsertPt);
23735 Builder.SetInsertPoint(PrevVec);
23737 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
23740 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
23741 Builder.GetInsertPoint()->comesBefore(VecI))
23742 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
23743 Builder.GetInsertPoint());
23744 if (Vec->
getType() != PrevVec->getType()) {
23746 PrevVec->getType()->isIntOrIntVectorTy() &&
23747 "Expected integer vector types only.");
23748 std::optional<bool> IsSigned;
23749 for (
Value *V : TE->Scalars) {
23751 for (
const TreeEntry *MNTE : getTreeEntries(V)) {
23752 auto It = MinBWs.find(MNTE);
23753 if (It != MinBWs.end()) {
23754 IsSigned = IsSigned.value_or(
false) || It->second.second;
23759 if (IsSigned.value_or(
false))
23762 for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
23763 auto It = MinBWs.find(BVE);
23764 if (It != MinBWs.end()) {
23765 IsSigned = IsSigned.value_or(
false) || It->second.second;
23770 if (IsSigned.value_or(
false))
23774 IsSigned.value_or(
false) ||
23778 if (IsSigned.value_or(
false))
23782 if (IsSigned.value_or(
false)) {
23784 auto It = MinBWs.find(TE->UserTreeIndex.UserTE);
23785 if (It != MinBWs.end())
23786 IsSigned = It->second.second;
23789 "Expected user node or perfect diamond match in MinBWs.");
23790 Vec = Builder.CreateIntCast(Vec, PrevVec->
getType(), *IsSigned);
23792 PrevVec->replaceAllUsesWith(Vec);
23793 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
23796 auto It = PostponedValues.
find(PrevVec);
23797 if (It != PostponedValues.
end()) {
23798 for (TreeEntry *VTE : It->getSecond())
23799 VTE->VectorizedValue = Vec;
23819 for (
const auto &ExternalUse : ExternalUses) {
23820 Value *Scalar = ExternalUse.Scalar;
23827 const TreeEntry *E = &ExternalUse.E;
23828 assert(E &&
"Invalid scalar");
23829 assert(!E->isGather() &&
"Extracting from a gather list");
23831 if (E->getOpcode() == Instruction::GetElementPtr &&
23835 Value *Vec = E->VectorizedValue;
23836 assert(Vec &&
"Can't find vectorizable value");
23838 Value *Lane = Builder.getInt32(ExternalUse.Lane);
23839 auto ExtractAndExtendIfNeeded = [&](
Value *Vec) {
23840 if (Scalar->getType() != Vec->
getType()) {
23841 Value *Ex =
nullptr;
23842 Value *ExV =
nullptr;
23844 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
23845 auto It = ScalarToEEs.
find(Scalar);
23846 if (It != ScalarToEEs.
end()) {
23849 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
23850 : Builder.GetInsertBlock());
23851 if (EEIt != It->second.end()) {
23852 Value *PrevV = EEIt->second.first;
23854 I && !ReplaceInst &&
23855 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
23856 Builder.GetInsertPoint()->comesBefore(
I)) {
23857 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
23858 Builder.GetInsertPoint());
23863 ExV = EEIt->second.second ? EEIt->second.second : Ex;
23872 IgnoredExtracts.
insert(EE);
23875 auto *CloneInst = Inst->clone();
23876 CloneInst->insertBefore(Inst->getIterator());
23877 if (Inst->hasName())
23878 CloneInst->takeName(Inst);
23883 Value *V = ES->getVectorOperand();
23886 V = ETEs.front()->VectorizedValue;
23888 !
IV ||
IV == Vec ||
IV->getParent() != IVec->getParent() ||
23889 IV->comesBefore(IVec))
23890 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
23892 Ex = Builder.CreateExtractElement(Vec, Lane);
23893 }
else if (
auto *VecTy =
23896 unsigned VecTyNumElements = VecTy->getNumElements();
23901 ExternalUse.Lane * VecTyNumElements);
23903 Ex = Builder.CreateExtractElement(Vec, Lane);
23908 if (Scalar->getType() != Ex->
getType())
23909 ExV = Builder.CreateIntCast(
23914 : &F->getEntryBlock(),
23915 std::make_pair(Ex, ExV));
23921 GatherShuffleExtractSeq.insert(ExI);
23922 CSEBlocks.insert(ExI->getParent());
23928 "In-tree scalar of vector type is not insertelement?");
23937 if (!ScalarsWithNullptrUser.
insert(Scalar).second)
23940 (ExternallyUsedValues.
count(Scalar) ||
23941 ExternalUsesWithNonUsers.count(Scalar) ||
23942 ExternalUsesAsOriginalScalar.contains(Scalar) ||
23946 if (ExternalUsesAsOriginalScalar.contains(U))
23948 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
23949 return !UseEntries.empty() &&
23950 (E->State == TreeEntry::Vectorize ||
23951 E->State == TreeEntry::StridedVectorize ||
23952 E->State == TreeEntry::CompressVectorize) &&
23953 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
23954 return (UseEntry->State == TreeEntry::Vectorize ||
23956 TreeEntry::StridedVectorize ||
23958 TreeEntry::CompressVectorize) &&
23959 doesInTreeUserNeedToExtract(
23960 Scalar, getRootEntryInstruction(*UseEntry),
23964 "Scalar with nullptr User must be registered in "
23965 "ExternallyUsedValues map or remain as scalar in vectorized "
23969 if (
PHI->getParent()->isLandingPad())
23970 Builder.SetInsertPoint(
23973 PHI->getParent()->getLandingPadInst()->getIterator()));
23975 Builder.SetInsertPoint(
PHI->getParent(),
23976 PHI->getParent()->getFirstNonPHIIt());
23978 Builder.SetInsertPoint(VecI->getParent(),
23979 std::next(VecI->getIterator()));
23982 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
23984 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
23986 if (Scalar != NewInst) {
23989 "Extractelements should not be replaced.");
23990 Scalar->replaceAllUsesWith(NewInst);
24000 if (!UsedInserts.
insert(VU).second)
24003 auto BWIt = MinBWs.find(E);
24005 auto *ScalarTy = FTy->getElementType();
24006 auto Key = std::make_pair(Vec, ScalarTy);
24007 auto VecIt = VectorCasts.
find(
Key);
24008 if (VecIt == VectorCasts.
end()) {
24011 if (IVec->getParent()->isLandingPad())
24012 Builder.SetInsertPoint(IVec->getParent(),
24013 std::next(IVec->getParent()
24014 ->getLandingPadInst()
24017 Builder.SetInsertPoint(
24018 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
24020 Builder.SetInsertPoint(IVec->getNextNode());
24022 Vec = Builder.CreateIntCast(
24027 BWIt->second.second);
24030 Vec = VecIt->second;
24037 ShuffledInserts, [VU](
const ShuffledInsertData<Value *> &
Data) {
24044 unsigned Idx = *InsertIdx;
24045 if (It == ShuffledInserts.
end()) {
24047 It = std::next(ShuffledInserts.
begin(),
24048 ShuffledInserts.
size() - 1);
24053 Mask[Idx] = ExternalUse.Lane;
24065 for (
unsigned I :
seq<unsigned>(0, PH->getNumIncomingValues())) {
24066 if (PH->getIncomingValue(
I) == Scalar) {
24068 PH->getIncomingBlock(
I)->getTerminator();
24070 Builder.SetInsertPoint(VecI->getParent(),
24071 std::next(VecI->getIterator()));
24073 Builder.SetInsertPoint(PH->getIncomingBlock(
I)->getTerminator());
24075 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
24076 PH->setOperand(
I, NewInst);
24081 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
24085 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
24086 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
24097 for (
int I = 0, E = Mask.size();
I < E; ++
I) {
24099 CombinedMask1[
I] = Mask[
I];
24101 CombinedMask2[
I] = Mask[
I] - VF;
24103 ShuffleInstructionBuilder ShuffleBuilder(
24105 ShuffleBuilder.add(V1, CombinedMask1);
24107 ShuffleBuilder.add(V2, CombinedMask2);
24108 return ShuffleBuilder.finalize({}, {}, {});
24111 auto &&ResizeToVF = [&CreateShuffle](
Value *Vec, ArrayRef<int>
Mask,
24112 bool ForSingleMask) {
24113 unsigned VF =
Mask.size();
24116 if (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); })) {
24117 Vec = CreateShuffle(Vec,
nullptr, Mask);
24118 return std::make_pair(Vec,
true);
24120 if (!ForSingleMask) {
24122 for (
unsigned I = 0;
I < VF; ++
I) {
24126 Vec = CreateShuffle(Vec,
nullptr, ResizeMask);
24130 return std::make_pair(Vec,
false);
24134 for (
int I = 0,
E = ShuffledInserts.size();
I <
E; ++
I) {
24137 InsertElementInst *FirstInsert = ShuffledInserts[
I].InsertElements.front();
24138 InsertElementInst *LastInsert = ShuffledInserts[
I].InsertElements.back();
24139 Builder.SetInsertPoint(LastInsert);
24140 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
24145 return cast<VectorType>(Vec->getType())
24146 ->getElementCount()
24147 .getKnownMinValue();
24150 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
24152 assert((Vals.size() == 1 || Vals.size() == 2) &&
24153 "Expected exactly 1 or 2 input values.");
24154 if (Vals.size() == 1) {
24157 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
24158 ->getNumElements() ||
24159 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
24160 return CreateShuffle(Vals.front(), nullptr, Mask);
24161 return Vals.front();
24163 return CreateShuffle(Vals.
front() ? Vals.
front()
24165 Vals.
back(), Mask);
24167 auto It = ShuffledInserts[
I].InsertElements.rbegin();
24169 InsertElementInst *
II =
nullptr;
24170 if (It != ShuffledInserts[
I].InsertElements.rend())
24173 while (It != ShuffledInserts[
I].InsertElements.rend()) {
24174 assert(
II &&
"Must be an insertelement instruction.");
24181 for (Instruction *
II :
reverse(Inserts)) {
24182 II->replaceUsesOfWith(
II->getOperand(0), NewInst);
24184 if (
II->getParent() == NewI->getParent() &&
II->comesBefore(NewI))
24185 II->moveAfter(NewI);
24189 for (InsertElementInst *IE :
reverse(ShuffledInserts[
I].InsertElements)) {
24190 IE->replaceUsesOfWith(
IE->getOperand(0),
24192 IE->replaceUsesOfWith(
IE->getOperand(1),
24196 CSEBlocks.insert(LastInsert->
getParent());
24201 for (
auto &TEPtr : VectorizableTree) {
24202 TreeEntry *
Entry = TEPtr.get();
24205 if (
Entry->isGather() ||
Entry->State == TreeEntry::SplitVectorize ||
24206 DeletedNodes.contains(Entry) ||
24207 TransformedToGatherNodes.contains(Entry))
24210 if (
Entry->CombinedOp == TreeEntry::ReducedBitcast ||
24211 Entry->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
24212 Entry->CombinedOp == TreeEntry::ReducedBitcastLoads ||
24213 Entry->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
24214 Entry->CombinedOp == TreeEntry::ReducedCmpBitcast) {
24216 if (!
Entry->hasState()) {
24223 if (!
I ||
Entry->isCopyableElement(
I))
24231 assert(
Entry->VectorizedValue &&
"Can't find vectorizable value");
24234 for (
int Lane = 0, LE =
Entry->Scalars.size(); Lane != LE; ++Lane) {
24237 if (
Entry->getOpcode() == Instruction::GetElementPtr &&
24241 EE && IgnoredExtracts.contains(EE))
24248 for (User *U :
Scalar->users()) {
24253 (UserIgnoreList && UserIgnoreList->contains(U)) ||
24256 "Deleting out-of-tree value");
24260 LLVM_DEBUG(
dbgs() <<
"SLP: \tErasing scalar:" << *Scalar <<
".\n");
24274 Value *RootVec = VectorizableTree.front()->VectorizedValue;
24277 SmallPtrSet<Value *, 16> DeadSet(RemovedInsts.
begin(), RemovedInsts.
end());
24278 auto AllUsesAreDead = [&](
const Value *
V) {
24280 [&](
const User *U) { return DeadSet.contains(U); });
24282 SmallPtrSet<const Value *, 16> Candidates;
24283 SmallVector<Instruction *, 16> Worklist;
24284 for (
const auto &TEPtr : VectorizableTree) {
24285 Value *VV = TEPtr->VectorizedValue;
24287 DeadSet.contains(VV))
24292 if (Candidates.
insert(
I).second)
24298 while (!Worklist.
empty()) {
24300 if (DeadSet.contains(
I))
24302 if (!
I->use_empty() && !AllUsesAreDead(
I))
24306 for (
Value *
Op :
I->operand_values())
24308 if (Candidates.
contains(OI) && !DeadSet.contains(OI))
24316 V->mergeDIAssignID(RemovedInsts);
24319 if (UserIgnoreList) {
24320 for (Instruction *
I : RemovedInsts) {
24324 if (Entries.
empty())
24326 const TreeEntry *
IE = Entries.
front();
24328 !SplitEntries.empty() && SplitEntries.front()->Idx <
IE->Idx)
24329 IE = SplitEntries.front();
24330 if (
IE->Idx != 0 &&
24331 !(VectorizableTree.front()->isGather() &&
IE->UserTreeIndex &&
24332 (ValueToGatherNodes.lookup(
I).contains(
24333 VectorizableTree.front().get()) ||
24334 (
IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
24335 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
24336 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
24337 IE->UserTreeIndex &&
24339 !(GatheredLoadsEntriesFirst.has_value() &&
24340 IE->Idx >= *GatheredLoadsEntriesFirst &&
24341 VectorizableTree.front()->isGather() &&
24343 !(!VectorizableTree.front()->isGather() &&
24344 VectorizableTree.front()->isCopyableElement(
I)))
24349 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
24350 (match(U.getUser(), m_LogicalAnd()) ||
24351 match(U.getUser(), m_LogicalOr())) &&
24352 U.getOperandNo() == 0;
24353 if (IsPoisoningLogicalOp) {
24354 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
24357 return UserIgnoreList->contains(
U.getUser());
24361 for (SelectInst *SI : LogicalOpSelects)
24371 Builder.ClearInsertionPoint();
24372 InstrElementSize.clear();
24374 const TreeEntry &RootTE = *VectorizableTree.front();
24375 Value *Vec = RootTE.VectorizedValue;
24376 if (
auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
24377 It != MinBWs.end() &&
24378 ReductionBitWidth != It->second.first) {
24379 IRBuilder<>::InsertPointGuard Guard(Builder);
24380 Builder.SetInsertPoint(ReductionRoot->getParent(),
24381 ReductionRoot->getIterator());
24383 Vec = Builder.CreateIntCast(Vec, Builder.getIntNTy(ReductionBitWidth),
24384 It->second.second);
24387 Vec = Builder.CreateIntCast(
24389 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
24391 It->second.second);
24398 LLVM_DEBUG(
dbgs() <<
"SLP: Optimizing " << GatherShuffleExtractSeq.size()
24399 <<
" gather sequences instructions.\n");
24406 Loop *L = LI->getLoopFor(
I->getParent());